1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements the targeting of the InstructionSelector class for 10 /// AMDGPU. 11 /// \todo This should be generated by TableGen. 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUInstructionSelector.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUGlobalISelUtils.h" 17 #include "AMDGPUInstrInfo.h" 18 #include "AMDGPURegisterBankInfo.h" 19 #include "AMDGPUTargetMachine.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "Utils/AMDGPUBaseInfo.h" 22 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 24 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 27 #include "llvm/CodeGen/MachineFrameInfo.h" 28 #include "llvm/IR/DiagnosticInfo.h" 29 #include "llvm/IR/IntrinsicsAMDGPU.h" 30 #include <optional> 31 32 #define DEBUG_TYPE "amdgpu-isel" 33 34 using namespace llvm; 35 using namespace MIPatternMatch; 36 37 #define GET_GLOBALISEL_IMPL 38 #define AMDGPUSubtarget GCNSubtarget 39 #include "AMDGPUGenGlobalISel.inc" 40 #undef GET_GLOBALISEL_IMPL 41 #undef AMDGPUSubtarget 42 43 AMDGPUInstructionSelector::AMDGPUInstructionSelector( 44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, 45 const AMDGPUTargetMachine &TM) 46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), 47 STI(STI), 48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), 49 #define GET_GLOBALISEL_PREDICATES_INIT 50 #include "AMDGPUGenGlobalISel.inc" 51 #undef GET_GLOBALISEL_PREDICATES_INIT 52 #define GET_GLOBALISEL_TEMPORARIES_INIT 53 #include "AMDGPUGenGlobalISel.inc" 54 #undef GET_GLOBALISEL_TEMPORARIES_INIT 55 { 56 } 57 58 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } 59 60 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, 61 CodeGenCoverage *CoverageInfo, 62 ProfileSummaryInfo *PSI, 63 BlockFrequencyInfo *BFI) { 64 MRI = &MF.getRegInfo(); 65 Subtarget = &MF.getSubtarget<GCNSubtarget>(); 66 Subtarget->checkSubtargetFeatures(MF.getFunction()); 67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI); 68 } 69 70 // Return the wave level SGPR base address if this is a wave address. 71 static Register getWaveAddress(const MachineInstr *Def) { 72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS 73 ? Def->getOperand(1).getReg() 74 : Register(); 75 } 76 77 bool AMDGPUInstructionSelector::isVCC(Register Reg, 78 const MachineRegisterInfo &MRI) const { 79 // The verifier is oblivious to s1 being a valid value for wavesize registers. 80 if (Reg.isPhysical()) 81 return false; 82 83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 84 const TargetRegisterClass *RC = 85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>(); 86 if (RC) { 87 const LLT Ty = MRI.getType(Reg); 88 if (!Ty.isValid() || Ty.getSizeInBits() != 1) 89 return false; 90 // G_TRUNC s1 result is never vcc. 
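    // A truncation produces an ordinary per-lane bit rather than a wave-wide
    // lane mask, so it is not treated as VCC even when its register class is
    // the boolean class. Illustrative generic MIR (register banks shown as an
    // example only):
    //   %x:vgpr(s32) = ...
    //   %b:vgpr(s1)  = G_TRUNC %x   ; plain 1-bit value, not a lane mask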
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC && 92 RC->hasSuperClassEq(TRI.getBoolRC()); 93 } 94 95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); 96 return RB->getID() == AMDGPU::VCCRegBankID; 97 } 98 99 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, 100 unsigned NewOpc) const { 101 MI.setDesc(TII.get(NewOpc)); 102 MI.removeOperand(1); // Remove intrinsic ID. 103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 104 105 MachineOperand &Dst = MI.getOperand(0); 106 MachineOperand &Src = MI.getOperand(1); 107 108 // TODO: This should be legalized to s32 if needed 109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1)) 110 return false; 111 112 const TargetRegisterClass *DstRC 113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 114 const TargetRegisterClass *SrcRC 115 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 116 if (!DstRC || DstRC != SrcRC) 117 return false; 118 119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && 120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); 121 } 122 123 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { 124 const DebugLoc &DL = I.getDebugLoc(); 125 MachineBasicBlock *BB = I.getParent(); 126 I.setDesc(TII.get(TargetOpcode::COPY)); 127 128 const MachineOperand &Src = I.getOperand(1); 129 MachineOperand &Dst = I.getOperand(0); 130 Register DstReg = Dst.getReg(); 131 Register SrcReg = Src.getReg(); 132 133 if (isVCC(DstReg, *MRI)) { 134 if (SrcReg == AMDGPU::SCC) { 135 const TargetRegisterClass *RC 136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 137 if (!RC) 138 return true; 139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI); 140 } 141 142 if (!isVCC(SrcReg, *MRI)) { 143 // TODO: Should probably leave the copy and let copyPhysReg expand it. 144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI)) 145 return false; 146 147 const TargetRegisterClass *SrcRC 148 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 149 150 std::optional<ValueAndVReg> ConstVal = 151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true); 152 if (ConstVal) { 153 unsigned MovOpc = 154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg) 156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0); 157 } else { 158 Register MaskedReg = MRI->createVirtualRegister(SrcRC); 159 160 // We can't trust the high bits at this point, so clear them. 161 162 // TODO: Skip masking high bits if def is known boolean. 163 164 bool IsSGPR = TRI.isSGPRClass(SrcRC); 165 unsigned AndOpc = 166 IsSGPR ? 
AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32; 167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg) 168 .addImm(1) 169 .addReg(SrcReg); 170 if (IsSGPR) 171 And.setOperandDead(3); // Dead scc 172 173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg) 174 .addImm(0) 175 .addReg(MaskedReg); 176 } 177 178 if (!MRI->getRegClassOrNull(SrcReg)) 179 MRI->setRegClass(SrcReg, SrcRC); 180 I.eraseFromParent(); 181 return true; 182 } 183 184 const TargetRegisterClass *RC = 185 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 187 return false; 188 189 return true; 190 } 191 192 for (const MachineOperand &MO : I.operands()) { 193 if (MO.getReg().isPhysical()) 194 continue; 195 196 const TargetRegisterClass *RC = 197 TRI.getConstrainedRegClassForOperand(MO, *MRI); 198 if (!RC) 199 continue; 200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI); 201 } 202 return true; 203 } 204 205 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { 206 const Register DefReg = I.getOperand(0).getReg(); 207 const LLT DefTy = MRI->getType(DefReg); 208 209 // S1 G_PHIs should not be selected in instruction-select, instead: 210 // - divergent S1 G_PHI should go through lane mask merging algorithm 211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering 212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect 213 if (DefTy == LLT::scalar(1)) 214 return false; 215 216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy) 217 218 const RegClassOrRegBank &RegClassOrBank = 219 MRI->getRegClassOrRegBank(DefReg); 220 221 const TargetRegisterClass *DefRC 222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); 223 if (!DefRC) { 224 if (!DefTy.isValid()) { 225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); 226 return false; 227 } 228 229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>(); 230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB); 231 if (!DefRC) { 232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); 233 return false; 234 } 235 } 236 237 // TODO: Verify that all registers have the same bank 238 I.setDesc(TII.get(TargetOpcode::PHI)); 239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI); 240 } 241 242 MachineOperand 243 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, 244 const TargetRegisterClass &SubRC, 245 unsigned SubIdx) const { 246 247 MachineInstr *MI = MO.getParent(); 248 MachineBasicBlock *BB = MO.getParent()->getParent(); 249 Register DstReg = MRI->createVirtualRegister(&SubRC); 250 251 if (MO.isReg()) { 252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx); 253 Register Reg = MO.getReg(); 254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 255 .addReg(Reg, 0, ComposedSubIdx); 256 257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(), 258 MO.isKill(), MO.isDead(), MO.isUndef(), 259 MO.isEarlyClobber(), 0, MO.isDebug(), 260 MO.isInternalRead()); 261 } 262 263 assert(MO.isImm()); 264 265 APInt Imm(64, MO.getImm()); 266 267 switch (SubIdx) { 268 default: 269 llvm_unreachable("do not know to split immediate with this sub index."); 270 case AMDGPU::sub0: 271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue()); 272 case AMDGPU::sub1: 273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue()); 274 } 275 } 276 277 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) { 278 switch (Opc) { 279 
case AMDGPU::G_AND: 280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 281 case AMDGPU::G_OR: 282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32; 283 case AMDGPU::G_XOR: 284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32; 285 default: 286 llvm_unreachable("not a bit op"); 287 } 288 } 289 290 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const { 291 Register DstReg = I.getOperand(0).getReg(); 292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 293 294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 295 if (DstRB->getID() != AMDGPU::SGPRRegBankID && 296 DstRB->getID() != AMDGPU::VCCRegBankID) 297 return false; 298 299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID && 300 STI.isWave64()); 301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64))); 302 303 // Dead implicit-def of scc 304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef 305 true, // isImp 306 false, // isKill 307 true)); // isDead 308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 309 } 310 311 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const { 312 MachineBasicBlock *BB = I.getParent(); 313 MachineFunction *MF = BB->getParent(); 314 Register DstReg = I.getOperand(0).getReg(); 315 const DebugLoc &DL = I.getDebugLoc(); 316 LLT Ty = MRI->getType(DstReg); 317 if (Ty.isVector()) 318 return false; 319 320 unsigned Size = Ty.getSizeInBits(); 321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID; 323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB; 324 325 if (Size == 32) { 326 if (IsSALU) { 327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; 328 MachineInstr *Add = 329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 330 .add(I.getOperand(1)) 331 .add(I.getOperand(2)) 332 .setOperandDead(3); // Dead scc 333 I.eraseFromParent(); 334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 335 } 336 337 if (STI.hasAddNoCarry()) { 338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64; 339 I.setDesc(TII.get(Opc)); 340 I.addOperand(*MF, MachineOperand::CreateImm(0)); 341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 343 } 344 345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64; 346 347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass()); 348 MachineInstr *Add 349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg) 350 .addDef(UnusedCarry, RegState::Dead) 351 .add(I.getOperand(1)) 352 .add(I.getOperand(2)) 353 .addImm(0); 354 I.eraseFromParent(); 355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI); 356 } 357 358 assert(!Sub && "illegal sub should not reach here"); 359 360 const TargetRegisterClass &RC 361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass; 362 const TargetRegisterClass &HalfRC 363 = IsSALU ? 
AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass; 364 365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0)); 366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0)); 367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1)); 368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1)); 369 370 Register DstLo = MRI->createVirtualRegister(&HalfRC); 371 Register DstHi = MRI->createVirtualRegister(&HalfRC); 372 373 if (IsSALU) { 374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo) 375 .add(Lo1) 376 .add(Lo2); 377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi) 378 .add(Hi1) 379 .add(Hi2) 380 .setOperandDead(3); // Dead scc 381 } else { 382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass(); 383 Register CarryReg = MRI->createVirtualRegister(CarryRC); 384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo) 385 .addDef(CarryReg) 386 .add(Lo1) 387 .add(Lo2) 388 .addImm(0); 389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi) 390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead) 391 .add(Hi1) 392 .add(Hi2) 393 .addReg(CarryReg, RegState::Kill) 394 .addImm(0); 395 396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI)) 397 return false; 398 } 399 400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 401 .addReg(DstLo) 402 .addImm(AMDGPU::sub0) 403 .addReg(DstHi) 404 .addImm(AMDGPU::sub1); 405 406 407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 408 return false; 409 410 I.eraseFromParent(); 411 return true; 412 } 413 414 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE( 415 MachineInstr &I) const { 416 MachineBasicBlock *BB = I.getParent(); 417 MachineFunction *MF = BB->getParent(); 418 const DebugLoc &DL = I.getDebugLoc(); 419 Register Dst0Reg = I.getOperand(0).getReg(); 420 Register Dst1Reg = I.getOperand(1).getReg(); 421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO || 422 I.getOpcode() == AMDGPU::G_UADDE; 423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE || 424 I.getOpcode() == AMDGPU::G_USUBE; 425 426 if (isVCC(Dst1Reg, *MRI)) { 427 unsigned NoCarryOpc = 428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc)); 431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 432 I.addOperand(*MF, MachineOperand::CreateImm(0)); 433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 434 } 435 436 Register Src0Reg = I.getOperand(2).getReg(); 437 Register Src1Reg = I.getOperand(3).getReg(); 438 439 if (HasCarryIn) { 440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 441 .addReg(I.getOperand(4).getReg()); 442 } 443 444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; 445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; 446 447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? 
CarryOpc : NoCarryOpc), Dst0Reg) 448 .add(I.getOperand(2)) 449 .add(I.getOperand(3)); 450 451 if (MRI->use_nodbg_empty(Dst1Reg)) { 452 CarryInst.setOperandDead(3); // Dead scc 453 } else { 454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg) 455 .addReg(AMDGPU::SCC); 456 if (!MRI->getRegClassOrNull(Dst1Reg)) 457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass); 458 } 459 460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) || 461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) || 462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI)) 463 return false; 464 465 if (HasCarryIn && 466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(), 467 AMDGPU::SReg_32RegClass, *MRI)) 468 return false; 469 470 I.eraseFromParent(); 471 return true; 472 } 473 474 bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( 475 MachineInstr &I) const { 476 MachineBasicBlock *BB = I.getParent(); 477 MachineFunction *MF = BB->getParent(); 478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; 479 480 unsigned Opc; 481 if (Subtarget->hasMADIntraFwdBug()) 482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64 483 : AMDGPU::V_MAD_I64_I32_gfx11_e64; 484 else 485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64; 486 I.setDesc(TII.get(Opc)); 487 I.addOperand(*MF, MachineOperand::CreateImm(0)); 488 I.addImplicitDefUseOperands(*MF); 489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 490 } 491 492 // TODO: We should probably legalize these to only using 32-bit results. 493 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { 494 MachineBasicBlock *BB = I.getParent(); 495 Register DstReg = I.getOperand(0).getReg(); 496 Register SrcReg = I.getOperand(1).getReg(); 497 LLT DstTy = MRI->getType(DstReg); 498 LLT SrcTy = MRI->getType(SrcReg); 499 const unsigned SrcSize = SrcTy.getSizeInBits(); 500 unsigned DstSize = DstTy.getSizeInBits(); 501 502 // TODO: Should handle any multiple of 32 offset. 503 unsigned Offset = I.getOperand(2).getImm(); 504 if (Offset % 32 != 0 || DstSize > 128) 505 return false; 506 507 // 16-bit operations really use 32-bit registers. 508 // FIXME: Probably should not allow 16-bit G_EXTRACT results. 
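  // A 16-bit result lives in the low half of a 32-bit register, so the extract
  // is emitted below as a 32-bit sub-register copy. E.g. (illustrative)
  // extracting bits [95:64] of a 128-bit source copies sub2, and a 64-bit
  // extract at offset 64 copies sub2_sub3.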
509 if (DstSize == 16) 510 DstSize = 32; 511 512 const TargetRegisterClass *DstRC = 513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI); 514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 515 return false; 516 517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 518 const TargetRegisterClass *SrcRC = 519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); 520 if (!SrcRC) 521 return false; 522 unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32, 523 DstSize / 32); 524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg); 525 if (!SrcRC) 526 return false; 527 528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I, 529 *SrcRC, I.getOperand(1)); 530 const DebugLoc &DL = I.getDebugLoc(); 531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg) 532 .addReg(SrcReg, 0, SubReg); 533 534 I.eraseFromParent(); 535 return true; 536 } 537 538 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { 539 MachineBasicBlock *BB = MI.getParent(); 540 Register DstReg = MI.getOperand(0).getReg(); 541 LLT DstTy = MRI->getType(DstReg); 542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg()); 543 544 const unsigned SrcSize = SrcTy.getSizeInBits(); 545 if (SrcSize < 32) 546 return selectImpl(MI, *CoverageInfo); 547 548 const DebugLoc &DL = MI.getDebugLoc(); 549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 550 const unsigned DstSize = DstTy.getSizeInBits(); 551 const TargetRegisterClass *DstRC = 552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 553 if (!DstRC) 554 return false; 555 556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8); 557 MachineInstrBuilder MIB = 558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg); 559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) { 560 MachineOperand &Src = MI.getOperand(I + 1); 561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef())); 562 MIB.addImm(SubRegs[I]); 563 564 const TargetRegisterClass *SrcRC 565 = TRI.getConstrainedRegClassForOperand(Src, *MRI); 566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) 567 return false; 568 } 569 570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 571 return false; 572 573 MI.eraseFromParent(); 574 return true; 575 } 576 577 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { 578 MachineBasicBlock *BB = MI.getParent(); 579 const int NumDst = MI.getNumOperands() - 1; 580 581 MachineOperand &Src = MI.getOperand(NumDst); 582 583 Register SrcReg = Src.getReg(); 584 Register DstReg0 = MI.getOperand(0).getReg(); 585 LLT DstTy = MRI->getType(DstReg0); 586 LLT SrcTy = MRI->getType(SrcReg); 587 588 const unsigned DstSize = DstTy.getSizeInBits(); 589 const unsigned SrcSize = SrcTy.getSizeInBits(); 590 const DebugLoc &DL = MI.getDebugLoc(); 591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI); 592 593 const TargetRegisterClass *SrcRC = 594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank); 595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 596 return false; 597 598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR 599 // source, and this relies on the fact that the same subregister indices are 600 // used for both. 
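  // For example (illustrative MIR, assuming a 64-bit source split into 32-bit
  // pieces), the unmerge is lowered to plain sub-register copies:
  //   %dst0 = COPY %src.sub0
  //   %dst1 = COPY %src.sub1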
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8); 602 for (int I = 0, E = NumDst; I != E; ++I) { 603 MachineOperand &Dst = MI.getOperand(I); 604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg()) 605 .addReg(SrcReg, 0, SubRegs[I]); 606 607 // Make sure the subregister index is valid for the source register. 608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]); 609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI)) 610 return false; 611 612 const TargetRegisterClass *DstRC = 613 TRI.getConstrainedRegClassForOperand(Dst, *MRI); 614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI)) 615 return false; 616 } 617 618 MI.eraseFromParent(); 619 return true; 620 } 621 622 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const { 623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC || 624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR); 625 626 Register Src0 = MI.getOperand(1).getReg(); 627 Register Src1 = MI.getOperand(2).getReg(); 628 LLT SrcTy = MRI->getType(Src0); 629 const unsigned SrcSize = SrcTy.getSizeInBits(); 630 631 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE. 632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) { 633 return selectG_MERGE_VALUES(MI); 634 } 635 636 // Selection logic below is for V2S16 only. 637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32. 638 Register Dst = MI.getOperand(0).getReg(); 639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) || 640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC && 641 SrcTy != LLT::scalar(32))) 642 return selectImpl(MI, *CoverageInfo); 643 644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI); 645 if (DstBank->getID() == AMDGPU::AGPRRegBankID) 646 return false; 647 648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID || 649 DstBank->getID() == AMDGPU::VGPRRegBankID); 650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID; 651 652 const DebugLoc &DL = MI.getDebugLoc(); 653 MachineBasicBlock *BB = MI.getParent(); 654 655 // First, before trying TableGen patterns, check if both sources are 656 // constants. In those cases, we can trivially compute the final constant 657 // and emit a simple move. 658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true); 659 if (ConstSrc1) { 660 auto ConstSrc0 = 661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true); 662 if (ConstSrc0) { 663 const int64_t K0 = ConstSrc0->Value.getSExtValue(); 664 const int64_t K1 = ConstSrc1->Value.getSExtValue(); 665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff; 666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff; 667 uint32_t Imm = Lo16 | (Hi16 << 16); 668 669 // VALU 670 if (IsVector) { 671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm); 672 MI.eraseFromParent(); 673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI); 674 } 675 676 // SALU 677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm); 678 MI.eraseFromParent(); 679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); 680 } 681 } 682 683 // Now try TableGen patterns. 
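  // If no imported pattern matches, the hand-written fallbacks below cover:
  // a build_vector whose high half is undef (folded to a COPY), a VALU pack
  // built from V_AND_B32 + V_LSHL_OR_B32, and the SALU S_PACK_* forms.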
684 if (selectImpl(MI, *CoverageInfo)) 685 return true; 686 687 // TODO: This should probably be a combine somewhere 688 // (build_vector $src0, undef) -> copy $src0 689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); 690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) { 691 MI.setDesc(TII.get(AMDGPU::COPY)); 692 MI.removeOperand(2); 693 const auto &RC = 694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 695 return RBI.constrainGenericRegister(Dst, RC, *MRI) && 696 RBI.constrainGenericRegister(Src0, RC, *MRI); 697 } 698 699 // TODO: Can be improved? 700 if (IsVector) { 701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg) 703 .addImm(0xFFFF) 704 .addReg(Src0); 705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) 706 return false; 707 708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst) 709 .addReg(Src1) 710 .addImm(16) 711 .addReg(TmpReg); 712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) 713 return false; 714 715 MI.eraseFromParent(); 716 return true; 717 } 718 719 Register ShiftSrc0; 720 Register ShiftSrc1; 721 722 // With multiple uses of the shift, this will duplicate the shift and 723 // increase register pressure. 724 // 725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) 726 // => (S_PACK_HH_B32_B16 $src0, $src1) 727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1) 728 // => (S_PACK_HL_B32_B16 $src0, $src1) 729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16)) 730 // => (S_PACK_LH_B32_B16 $src0, $src1) 731 // (build_vector $src0, $src1) 732 // => (S_PACK_LL_B32_B16 $src0, $src1) 733 734 bool Shift0 = mi_match( 735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16)))); 736 737 bool Shift1 = mi_match( 738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16)))); 739 740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16; 741 if (Shift0 && Shift1) { 742 Opc = AMDGPU::S_PACK_HH_B32_B16; 743 MI.getOperand(1).setReg(ShiftSrc0); 744 MI.getOperand(2).setReg(ShiftSrc1); 745 } else if (Shift1) { 746 Opc = AMDGPU::S_PACK_LH_B32_B16; 747 MI.getOperand(2).setReg(ShiftSrc1); 748 } else if (Shift0) { 749 auto ConstSrc1 = 750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true); 751 if (ConstSrc1 && ConstSrc1->Value == 0) { 752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) 754 .addReg(ShiftSrc0) 755 .addImm(16) 756 .setOperandDead(3); // Dead scc 757 758 MI.eraseFromParent(); 759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 760 } 761 if (STI.hasSPackHL()) { 762 Opc = AMDGPU::S_PACK_HL_B32_B16; 763 MI.getOperand(1).setReg(ShiftSrc0); 764 } 765 } 766 767 MI.setDesc(TII.get(Opc)); 768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 769 } 770 771 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { 772 const MachineOperand &MO = I.getOperand(0); 773 774 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The 775 // regbank check here is to know why getConstrainedRegClassForOperand failed. 
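  // If the operand has neither a class nor a bank there is nothing to
  // constrain yet; otherwise it must be constrainable before the generic
  // opcode is rewritten to IMPLICIT_DEF.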
776 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI); 777 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) || 778 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) { 779 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); 780 return true; 781 } 782 783 return false; 784 } 785 786 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { 787 MachineBasicBlock *BB = I.getParent(); 788 789 Register DstReg = I.getOperand(0).getReg(); 790 Register Src0Reg = I.getOperand(1).getReg(); 791 Register Src1Reg = I.getOperand(2).getReg(); 792 LLT Src1Ty = MRI->getType(Src1Reg); 793 794 unsigned DstSize = MRI->getType(DstReg).getSizeInBits(); 795 unsigned InsSize = Src1Ty.getSizeInBits(); 796 797 int64_t Offset = I.getOperand(3).getImm(); 798 799 // FIXME: These cases should have been illegal and unnecessary to check here. 800 if (Offset % 32 != 0 || InsSize % 32 != 0) 801 return false; 802 803 // Currently not handled by getSubRegFromChannel. 804 if (InsSize > 128) 805 return false; 806 807 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32); 808 if (SubReg == AMDGPU::NoSubRegister) 809 return false; 810 811 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 812 const TargetRegisterClass *DstRC = 813 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 814 if (!DstRC) 815 return false; 816 817 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI); 818 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI); 819 const TargetRegisterClass *Src0RC = 820 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank); 821 const TargetRegisterClass *Src1RC = 822 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank); 823 824 // Deal with weird cases where the class only partially supports the subreg 825 // index. 826 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg); 827 if (!Src0RC || !Src1RC) 828 return false; 829 830 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 831 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) || 832 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI)) 833 return false; 834 835 const DebugLoc &DL = I.getDebugLoc(); 836 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg) 837 .addReg(Src0Reg) 838 .addReg(Src1Reg) 839 .addImm(SubReg); 840 841 I.eraseFromParent(); 842 return true; 843 } 844 845 bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const { 846 Register DstReg = MI.getOperand(0).getReg(); 847 Register SrcReg = MI.getOperand(1).getReg(); 848 Register OffsetReg = MI.getOperand(2).getReg(); 849 Register WidthReg = MI.getOperand(3).getReg(); 850 851 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID && 852 "scalar BFX instructions are expanded in regbankselect"); 853 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 && 854 "64-bit vector BFX instructions are expanded in regbankselect"); 855 856 const DebugLoc &DL = MI.getDebugLoc(); 857 MachineBasicBlock *MBB = MI.getParent(); 858 859 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX; 860 unsigned Opc = IsSigned ? 
AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 861 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg) 862 .addReg(SrcReg) 863 .addReg(OffsetReg) 864 .addReg(WidthReg); 865 MI.eraseFromParent(); 866 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 867 } 868 869 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const { 870 if (STI.getLDSBankCount() != 16) 871 return selectImpl(MI, *CoverageInfo); 872 873 Register Dst = MI.getOperand(0).getReg(); 874 Register Src0 = MI.getOperand(2).getReg(); 875 Register M0Val = MI.getOperand(6).getReg(); 876 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) || 877 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) || 878 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI)) 879 return false; 880 881 // This requires 2 instructions. It is possible to write a pattern to support 882 // this, but the generated isel emitter doesn't correctly deal with multiple 883 // output instructions using the same physical register input. The copy to m0 884 // is incorrectly placed before the second instruction. 885 // 886 // TODO: Match source modifiers. 887 888 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 889 const DebugLoc &DL = MI.getDebugLoc(); 890 MachineBasicBlock *MBB = MI.getParent(); 891 892 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 893 .addReg(M0Val); 894 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov) 895 .addImm(2) 896 .addImm(MI.getOperand(4).getImm()) // $attr 897 .addImm(MI.getOperand(3).getImm()); // $attrchan 898 899 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst) 900 .addImm(0) // $src0_modifiers 901 .addReg(Src0) // $src0 902 .addImm(MI.getOperand(4).getImm()) // $attr 903 .addImm(MI.getOperand(3).getImm()) // $attrchan 904 .addImm(0) // $src2_modifiers 905 .addReg(InterpMov) // $src2 - 2 f16 values selected by high 906 .addImm(MI.getOperand(5).getImm()) // $high 907 .addImm(0) // $clamp 908 .addImm(0); // $omod 909 910 MI.eraseFromParent(); 911 return true; 912 } 913 914 // Writelane is special in that it can use SGPR and M0 (which would normally 915 // count as using the constant bus twice - but in this case it is allowed since 916 // the lane selector doesn't count as a use of the constant bus). However, it is 917 // still required to abide by the 1 SGPR rule. Fix this up if we might have 918 // multiple SGPRs. 919 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const { 920 // With a constant bus limit of at least 2, there's no issue. 921 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1) 922 return selectImpl(MI, *CoverageInfo); 923 924 MachineBasicBlock *MBB = MI.getParent(); 925 const DebugLoc &DL = MI.getDebugLoc(); 926 Register VDst = MI.getOperand(0).getReg(); 927 Register Val = MI.getOperand(2).getReg(); 928 Register LaneSelect = MI.getOperand(3).getReg(); 929 Register VDstIn = MI.getOperand(4).getReg(); 930 931 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst); 932 933 std::optional<ValueAndVReg> ConstSelect = 934 getIConstantVRegValWithLookThrough(LaneSelect, *MRI); 935 if (ConstSelect) { 936 // The selector has to be an inline immediate, so we can use whatever for 937 // the other operands. 
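    // The selector is masked to the wave size, e.g. (illustrative) a constant
    // lane index of 35 becomes lane 3 in wave32.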
938 MIB.addReg(Val); 939 MIB.addImm(ConstSelect->Value.getSExtValue() & 940 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2())); 941 } else { 942 std::optional<ValueAndVReg> ConstVal = 943 getIConstantVRegValWithLookThrough(Val, *MRI); 944 945 // If the value written is an inline immediate, we can get away without a 946 // copy to m0. 947 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(), 948 STI.hasInv2PiInlineImm())) { 949 MIB.addImm(ConstVal->Value.getSExtValue()); 950 MIB.addReg(LaneSelect); 951 } else { 952 MIB.addReg(Val); 953 954 // If the lane selector was originally in a VGPR and copied with 955 // readfirstlane, there's a hazard to read the same SGPR from the 956 // VALU. Constrain to a different SGPR to help avoid needing a nop later. 957 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI); 958 959 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 960 .addReg(LaneSelect); 961 MIB.addReg(AMDGPU::M0); 962 } 963 } 964 965 MIB.addReg(VDstIn); 966 967 MI.eraseFromParent(); 968 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 969 } 970 971 // We need to handle this here because tablegen doesn't support matching 972 // instructions with multiple outputs. 973 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const { 974 Register Dst0 = MI.getOperand(0).getReg(); 975 Register Dst1 = MI.getOperand(1).getReg(); 976 977 LLT Ty = MRI->getType(Dst0); 978 unsigned Opc; 979 if (Ty == LLT::scalar(32)) 980 Opc = AMDGPU::V_DIV_SCALE_F32_e64; 981 else if (Ty == LLT::scalar(64)) 982 Opc = AMDGPU::V_DIV_SCALE_F64_e64; 983 else 984 return false; 985 986 // TODO: Match source modifiers. 987 988 const DebugLoc &DL = MI.getDebugLoc(); 989 MachineBasicBlock *MBB = MI.getParent(); 990 991 Register Numer = MI.getOperand(3).getReg(); 992 Register Denom = MI.getOperand(4).getReg(); 993 unsigned ChooseDenom = MI.getOperand(5).getImm(); 994 995 Register Src0 = ChooseDenom != 0 ? Numer : Denom; 996 997 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) 998 .addDef(Dst1) 999 .addImm(0) // $src0_modifiers 1000 .addUse(Src0) // $src0 1001 .addImm(0) // $src1_modifiers 1002 .addUse(Denom) // $src1 1003 .addImm(0) // $src2_modifiers 1004 .addUse(Numer) // $src2 1005 .addImm(0) // $clamp 1006 .addImm(0); // $omod 1007 1008 MI.eraseFromParent(); 1009 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1010 } 1011 1012 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { 1013 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); 1014 switch (IntrinsicID) { 1015 case Intrinsic::amdgcn_if_break: { 1016 MachineBasicBlock *BB = I.getParent(); 1017 1018 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1019 // SelectionDAG uses for wave32 vs wave64. 
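    // SI_IF_BREAK consumes and produces full wave masks, so the destination
    // and both sources are constrained to the wave-mask register class below
    // instead of an s1 boolean.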
1020 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK)) 1021 .add(I.getOperand(0)) 1022 .add(I.getOperand(2)) 1023 .add(I.getOperand(3)); 1024 1025 Register DstReg = I.getOperand(0).getReg(); 1026 Register Src0Reg = I.getOperand(2).getReg(); 1027 Register Src1Reg = I.getOperand(3).getReg(); 1028 1029 I.eraseFromParent(); 1030 1031 for (Register Reg : { DstReg, Src0Reg, Src1Reg }) 1032 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1033 1034 return true; 1035 } 1036 case Intrinsic::amdgcn_interp_p1_f16: 1037 return selectInterpP1F16(I); 1038 case Intrinsic::amdgcn_wqm: 1039 return constrainCopyLikeIntrin(I, AMDGPU::WQM); 1040 case Intrinsic::amdgcn_softwqm: 1041 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); 1042 case Intrinsic::amdgcn_strict_wwm: 1043 case Intrinsic::amdgcn_wwm: 1044 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM); 1045 case Intrinsic::amdgcn_strict_wqm: 1046 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM); 1047 case Intrinsic::amdgcn_writelane: 1048 return selectWritelane(I); 1049 case Intrinsic::amdgcn_div_scale: 1050 return selectDivScale(I); 1051 case Intrinsic::amdgcn_icmp: 1052 case Intrinsic::amdgcn_fcmp: 1053 if (selectImpl(I, *CoverageInfo)) 1054 return true; 1055 return selectIntrinsicCmp(I); 1056 case Intrinsic::amdgcn_ballot: 1057 return selectBallot(I); 1058 case Intrinsic::amdgcn_reloc_constant: 1059 return selectRelocConstant(I); 1060 case Intrinsic::amdgcn_groupstaticsize: 1061 return selectGroupStaticSize(I); 1062 case Intrinsic::returnaddress: 1063 return selectReturnAddress(I); 1064 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 1065 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 1066 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 1067 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 1068 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 1069 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 1070 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 1071 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 1072 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 1073 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 1074 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 1075 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 1076 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 1077 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: 1078 return selectSMFMACIntrin(I); 1079 default: 1080 return selectImpl(I, *CoverageInfo); 1081 } 1082 } 1083 1084 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size, 1085 const GCNSubtarget &ST) { 1086 if (Size != 16 && Size != 32 && Size != 64) 1087 return -1; 1088 1089 if (Size == 16 && !ST.has16BitInsts()) 1090 return -1; 1091 1092 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc, 1093 unsigned S64Opc) { 1094 if (Size == 16) 1095 return ST.hasTrue16BitInsts() ? 
TrueS16Opc : S16Opc; 1096 if (Size == 32) 1097 return S32Opc; 1098 return S64Opc; 1099 }; 1100 1101 switch (P) { 1102 default: 1103 llvm_unreachable("Unknown condition code!"); 1104 case CmpInst::ICMP_NE: 1105 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64, 1106 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64); 1107 case CmpInst::ICMP_EQ: 1108 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64, 1109 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64); 1110 case CmpInst::ICMP_SGT: 1111 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64, 1112 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64); 1113 case CmpInst::ICMP_SGE: 1114 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64, 1115 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64); 1116 case CmpInst::ICMP_SLT: 1117 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64, 1118 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64); 1119 case CmpInst::ICMP_SLE: 1120 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64, 1121 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64); 1122 case CmpInst::ICMP_UGT: 1123 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64, 1124 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64); 1125 case CmpInst::ICMP_UGE: 1126 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64, 1127 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64); 1128 case CmpInst::ICMP_ULT: 1129 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64, 1130 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64); 1131 case CmpInst::ICMP_ULE: 1132 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64, 1133 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64); 1134 1135 case CmpInst::FCMP_OEQ: 1136 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64, 1137 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64); 1138 case CmpInst::FCMP_OGT: 1139 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64, 1140 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64); 1141 case CmpInst::FCMP_OGE: 1142 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64, 1143 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64); 1144 case CmpInst::FCMP_OLT: 1145 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64, 1146 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64); 1147 case CmpInst::FCMP_OLE: 1148 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64, 1149 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64); 1150 case CmpInst::FCMP_ONE: 1151 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64, 1152 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64); 1153 case CmpInst::FCMP_ORD: 1154 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64, 1155 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64); 1156 case CmpInst::FCMP_UNO: 1157 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64, 1158 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64); 1159 case CmpInst::FCMP_UEQ: 1160 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64, 1161 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64); 1162 case CmpInst::FCMP_UGT: 1163 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64, 1164 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64); 1165 case CmpInst::FCMP_UGE: 1166 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64, 1167 
AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64); 1168 case CmpInst::FCMP_ULT: 1169 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64, 1170 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64); 1171 case CmpInst::FCMP_ULE: 1172 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64, 1173 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64); 1174 case CmpInst::FCMP_UNE: 1175 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64, 1176 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64); 1177 case CmpInst::FCMP_TRUE: 1178 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64, 1179 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64); 1180 case CmpInst::FCMP_FALSE: 1181 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64, 1182 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64); 1183 } 1184 } 1185 1186 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P, 1187 unsigned Size) const { 1188 if (Size == 64) { 1189 if (!STI.hasScalarCompareEq64()) 1190 return -1; 1191 1192 switch (P) { 1193 case CmpInst::ICMP_NE: 1194 return AMDGPU::S_CMP_LG_U64; 1195 case CmpInst::ICMP_EQ: 1196 return AMDGPU::S_CMP_EQ_U64; 1197 default: 1198 return -1; 1199 } 1200 } 1201 1202 if (Size == 32) { 1203 switch (P) { 1204 case CmpInst::ICMP_NE: 1205 return AMDGPU::S_CMP_LG_U32; 1206 case CmpInst::ICMP_EQ: 1207 return AMDGPU::S_CMP_EQ_U32; 1208 case CmpInst::ICMP_SGT: 1209 return AMDGPU::S_CMP_GT_I32; 1210 case CmpInst::ICMP_SGE: 1211 return AMDGPU::S_CMP_GE_I32; 1212 case CmpInst::ICMP_SLT: 1213 return AMDGPU::S_CMP_LT_I32; 1214 case CmpInst::ICMP_SLE: 1215 return AMDGPU::S_CMP_LE_I32; 1216 case CmpInst::ICMP_UGT: 1217 return AMDGPU::S_CMP_GT_U32; 1218 case CmpInst::ICMP_UGE: 1219 return AMDGPU::S_CMP_GE_U32; 1220 case CmpInst::ICMP_ULT: 1221 return AMDGPU::S_CMP_LT_U32; 1222 case CmpInst::ICMP_ULE: 1223 return AMDGPU::S_CMP_LE_U32; 1224 case CmpInst::FCMP_OEQ: 1225 return AMDGPU::S_CMP_EQ_F32; 1226 case CmpInst::FCMP_OGT: 1227 return AMDGPU::S_CMP_GT_F32; 1228 case CmpInst::FCMP_OGE: 1229 return AMDGPU::S_CMP_GE_F32; 1230 case CmpInst::FCMP_OLT: 1231 return AMDGPU::S_CMP_LT_F32; 1232 case CmpInst::FCMP_OLE: 1233 return AMDGPU::S_CMP_LE_F32; 1234 case CmpInst::FCMP_ONE: 1235 return AMDGPU::S_CMP_LG_F32; 1236 case CmpInst::FCMP_ORD: 1237 return AMDGPU::S_CMP_O_F32; 1238 case CmpInst::FCMP_UNO: 1239 return AMDGPU::S_CMP_U_F32; 1240 case CmpInst::FCMP_UEQ: 1241 return AMDGPU::S_CMP_NLG_F32; 1242 case CmpInst::FCMP_UGT: 1243 return AMDGPU::S_CMP_NLE_F32; 1244 case CmpInst::FCMP_UGE: 1245 return AMDGPU::S_CMP_NLT_F32; 1246 case CmpInst::FCMP_ULT: 1247 return AMDGPU::S_CMP_NGE_F32; 1248 case CmpInst::FCMP_ULE: 1249 return AMDGPU::S_CMP_NGT_F32; 1250 case CmpInst::FCMP_UNE: 1251 return AMDGPU::S_CMP_NEQ_F32; 1252 default: 1253 llvm_unreachable("Unknown condition code!"); 1254 } 1255 } 1256 1257 if (Size == 16) { 1258 if (!STI.hasSALUFloatInsts()) 1259 return -1; 1260 1261 switch (P) { 1262 case CmpInst::FCMP_OEQ: 1263 return AMDGPU::S_CMP_EQ_F16; 1264 case CmpInst::FCMP_OGT: 1265 return AMDGPU::S_CMP_GT_F16; 1266 case CmpInst::FCMP_OGE: 1267 return AMDGPU::S_CMP_GE_F16; 1268 case CmpInst::FCMP_OLT: 1269 return AMDGPU::S_CMP_LT_F16; 1270 case CmpInst::FCMP_OLE: 1271 return AMDGPU::S_CMP_LE_F16; 1272 case CmpInst::FCMP_ONE: 1273 return AMDGPU::S_CMP_LG_F16; 1274 case CmpInst::FCMP_ORD: 1275 return AMDGPU::S_CMP_O_F16; 1276 case CmpInst::FCMP_UNO: 1277 return AMDGPU::S_CMP_U_F16; 1278 case CmpInst::FCMP_UEQ: 1279 return 
AMDGPU::S_CMP_NLG_F16; 1280 case CmpInst::FCMP_UGT: 1281 return AMDGPU::S_CMP_NLE_F16; 1282 case CmpInst::FCMP_UGE: 1283 return AMDGPU::S_CMP_NLT_F16; 1284 case CmpInst::FCMP_ULT: 1285 return AMDGPU::S_CMP_NGE_F16; 1286 case CmpInst::FCMP_ULE: 1287 return AMDGPU::S_CMP_NGT_F16; 1288 case CmpInst::FCMP_UNE: 1289 return AMDGPU::S_CMP_NEQ_F16; 1290 default: 1291 llvm_unreachable("Unknown condition code!"); 1292 } 1293 } 1294 1295 return -1; 1296 } 1297 1298 bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const { 1299 1300 MachineBasicBlock *BB = I.getParent(); 1301 const DebugLoc &DL = I.getDebugLoc(); 1302 1303 Register SrcReg = I.getOperand(2).getReg(); 1304 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1305 1306 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate(); 1307 1308 Register CCReg = I.getOperand(0).getReg(); 1309 if (!isVCC(CCReg, *MRI)) { 1310 int Opcode = getS_CMPOpcode(Pred, Size); 1311 if (Opcode == -1) 1312 return false; 1313 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) 1314 .add(I.getOperand(2)) 1315 .add(I.getOperand(3)); 1316 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg) 1317 .addReg(AMDGPU::SCC); 1318 bool Ret = 1319 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && 1320 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); 1321 I.eraseFromParent(); 1322 return Ret; 1323 } 1324 1325 if (I.getOpcode() == AMDGPU::G_FCMP) 1326 return false; 1327 1328 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget); 1329 if (Opcode == -1) 1330 return false; 1331 1332 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), 1333 I.getOperand(0).getReg()) 1334 .add(I.getOperand(2)) 1335 .add(I.getOperand(3)); 1336 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), 1337 *TRI.getBoolRC(), *MRI); 1338 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI); 1339 I.eraseFromParent(); 1340 return Ret; 1341 } 1342 1343 bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { 1344 Register Dst = I.getOperand(0).getReg(); 1345 if (isVCC(Dst, *MRI)) 1346 return false; 1347 1348 LLT DstTy = MRI->getType(Dst); 1349 if (DstTy.getSizeInBits() != STI.getWavefrontSize()) 1350 return false; 1351 1352 MachineBasicBlock *BB = I.getParent(); 1353 const DebugLoc &DL = I.getDebugLoc(); 1354 Register SrcReg = I.getOperand(2).getReg(); 1355 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); 1356 1357 // i1 inputs are not supported in GlobalISel. 
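  // For the supported sizes the comparison is emitted below as a VOPC *_e64
  // instruction with any folded source modifiers, writing a full wave mask to
  // Dst; an unrecognized predicate degenerates to an IMPLICIT_DEF.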
1358 if (Size == 1) 1359 return false; 1360 1361 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm()); 1362 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) { 1363 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst); 1364 I.eraseFromParent(); 1365 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI); 1366 } 1367 1368 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget); 1369 if (Opcode == -1) 1370 return false; 1371 1372 MachineInstrBuilder SelectedMI; 1373 MachineOperand &LHS = I.getOperand(2); 1374 MachineOperand &RHS = I.getOperand(3); 1375 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS); 1376 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS); 1377 Register Src0Reg = 1378 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true); 1379 Register Src1Reg = 1380 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true); 1381 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst); 1382 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) 1383 SelectedMI.addImm(Src0Mods); 1384 SelectedMI.addReg(Src0Reg); 1385 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers)) 1386 SelectedMI.addImm(Src1Mods); 1387 SelectedMI.addReg(Src1Reg); 1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp)) 1389 SelectedMI.addImm(0); // clamp 1390 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) 1391 SelectedMI.addImm(0); // op_sel 1392 1393 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI); 1394 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI)) 1395 return false; 1396 1397 I.eraseFromParent(); 1398 return true; 1399 } 1400 1401 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { 1402 MachineBasicBlock *BB = I.getParent(); 1403 const DebugLoc &DL = I.getDebugLoc(); 1404 Register DstReg = I.getOperand(0).getReg(); 1405 const unsigned Size = MRI->getType(DstReg).getSizeInBits(); 1406 const bool Is64 = Size == 64; 1407 const bool IsWave32 = (STI.getWavefrontSize() == 32); 1408 1409 // In the common case, the return type matches the wave size. 1410 // However we also support emitting i64 ballots in wave32 mode. 1411 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) 1412 return false; 1413 1414 std::optional<ValueAndVReg> Arg = 1415 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); 1416 1417 const auto BuildCopy = [&](Register SrcReg) { 1418 if (Size == STI.getWavefrontSize()) { 1419 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) 1420 .addReg(SrcReg); 1421 return; 1422 } 1423 1424 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. 1425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1426 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); 1427 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1428 .addReg(SrcReg) 1429 .addImm(AMDGPU::sub0) 1430 .addReg(HiReg) 1431 .addImm(AMDGPU::sub1); 1432 }; 1433 1434 if (Arg) { 1435 const int64_t Value = Arg->Value.getSExtValue(); 1436 if (Value == 0) { 1437 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 1438 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); 1439 } else if (Value == -1) // all ones 1440 BuildCopy(IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC); 1441 else 1442 return false; 1443 } else 1444 BuildCopy(I.getOperand(2).getReg()); 1445 1446 I.eraseFromParent(); 1447 return true; 1448 } 1449 1450 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { 1451 Register DstReg = I.getOperand(0).getReg(); 1452 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1453 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank); 1454 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 1455 return false; 1456 1457 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID; 1458 1459 Module *M = MF->getFunction().getParent(); 1460 const MDNode *Metadata = I.getOperand(2).getMetadata(); 1461 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 1462 auto RelocSymbol = cast<GlobalVariable>( 1463 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 1464 1465 MachineBasicBlock *BB = I.getParent(); 1466 BuildMI(*BB, &I, I.getDebugLoc(), 1467 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg) 1468 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO); 1469 1470 I.eraseFromParent(); 1471 return true; 1472 } 1473 1474 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { 1475 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS(); 1476 1477 Register DstReg = I.getOperand(0).getReg(); 1478 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1479 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ? 1480 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1481 1482 MachineBasicBlock *MBB = I.getParent(); 1483 const DebugLoc &DL = I.getDebugLoc(); 1484 1485 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg); 1486 1487 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) { 1488 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1489 MIB.addImm(MFI->getLDSSize()); 1490 } else { 1491 Module *M = MF->getFunction().getParent(); 1492 const GlobalValue *GV 1493 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize); 1494 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 1495 } 1496 1497 I.eraseFromParent(); 1498 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1499 } 1500 1501 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { 1502 MachineBasicBlock *MBB = I.getParent(); 1503 MachineFunction &MF = *MBB->getParent(); 1504 const DebugLoc &DL = I.getDebugLoc(); 1505 1506 MachineOperand &Dst = I.getOperand(0); 1507 Register DstReg = Dst.getReg(); 1508 unsigned Depth = I.getOperand(2).getImm(); 1509 1510 const TargetRegisterClass *RC 1511 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 1512 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) || 1513 !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 1514 return false; 1515 1516 // Check for kernel and shader functions 1517 if (Depth != 0 || 1518 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1519 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1520 .addImm(0); 1521 I.eraseFromParent(); 1522 return true; 1523 } 1524 1525 MachineFrameInfo &MFI = MF.getFrameInfo(); 1526 // There is a call to @llvm.returnaddress in this function 1527 MFI.setReturnAddressIsTaken(true); 1528 1529 // Get the return address reg and mark it as an implicit live-in 1530 Register ReturnAddrReg = TRI.getReturnAddressReg(MF); 1531 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, 1532 AMDGPU::SReg_64RegClass, DL); 1533 BuildMI(*MBB, &I, DL, 
TII.get(AMDGPU::COPY), DstReg) 1534 .addReg(LiveIn); 1535 I.eraseFromParent(); 1536 return true; 1537 } 1538 1539 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 1540 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1541 // SelectionDAG uses for wave32 vs wave64. 1542 MachineBasicBlock *BB = MI.getParent(); 1543 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 1544 .add(MI.getOperand(1)); 1545 1546 Register Reg = MI.getOperand(1).getReg(); 1547 MI.eraseFromParent(); 1548 1549 if (!MRI->getRegClassOrNull(Reg)) 1550 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1551 return true; 1552 } 1553 1554 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1555 MachineInstr &MI, Intrinsic::ID IntrID) const { 1556 MachineBasicBlock *MBB = MI.getParent(); 1557 MachineFunction *MF = MBB->getParent(); 1558 const DebugLoc &DL = MI.getDebugLoc(); 1559 1560 unsigned IndexOperand = MI.getOperand(7).getImm(); 1561 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1562 bool WaveDone = MI.getOperand(9).getImm() != 0; 1563 1564 if (WaveDone && !WaveRelease) 1565 report_fatal_error("ds_ordered_count: wave_done requires wave_release"); 1566 1567 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1568 IndexOperand &= ~0x3f; 1569 unsigned CountDw = 0; 1570 1571 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1572 CountDw = (IndexOperand >> 24) & 0xf; 1573 IndexOperand &= ~(0xf << 24); 1574 1575 if (CountDw < 1 || CountDw > 4) { 1576 report_fatal_error( 1577 "ds_ordered_count: dword count must be between 1 and 4"); 1578 } 1579 } 1580 1581 if (IndexOperand) 1582 report_fatal_error("ds_ordered_count: bad index operand"); 1583 1584 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1; 1585 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); 1586 1587 unsigned Offset0 = OrderedCountIndex << 2; 1588 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 1589 1590 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1591 Offset1 |= (CountDw - 1) << 6; 1592 1593 if (STI.getGeneration() < AMDGPUSubtarget::GFX11) 1594 Offset1 |= ShaderType << 2; 1595 1596 unsigned Offset = Offset0 | (Offset1 << 8); 1597 1598 Register M0Val = MI.getOperand(2).getReg(); 1599 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1600 .addReg(M0Val); 1601 1602 Register DstReg = MI.getOperand(0).getReg(); 1603 Register ValReg = MI.getOperand(3).getReg(); 1604 MachineInstrBuilder DS = 1605 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1606 .addReg(ValReg) 1607 .addImm(Offset) 1608 .cloneMemRefs(MI); 1609 1610 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1611 return false; 1612 1613 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1614 MI.eraseFromParent(); 1615 return Ret; 1616 } 1617 1618 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1619 switch (IntrID) { 1620 case Intrinsic::amdgcn_ds_gws_init: 1621 return AMDGPU::DS_GWS_INIT; 1622 case Intrinsic::amdgcn_ds_gws_barrier: 1623 return AMDGPU::DS_GWS_BARRIER; 1624 case Intrinsic::amdgcn_ds_gws_sema_v: 1625 return AMDGPU::DS_GWS_SEMA_V; 1626 case Intrinsic::amdgcn_ds_gws_sema_br: 1627 return AMDGPU::DS_GWS_SEMA_BR; 1628 case Intrinsic::amdgcn_ds_gws_sema_p: 1629 return AMDGPU::DS_GWS_SEMA_P; 1630 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1631 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1632 default: 1633 llvm_unreachable("not a gws intrinsic"); 1634 } 1635 } 1636 1637 bool 
AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1638 Intrinsic::ID IID) const { 1639 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1640 !STI.hasGWSSemaReleaseAll())) 1641 return false; 1642 1643 // intrinsic ID, vsrc, offset 1644 const bool HasVSrc = MI.getNumOperands() == 3; 1645 assert(HasVSrc || MI.getNumOperands() == 2); 1646 1647 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1648 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1649 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1650 return false; 1651 1652 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1653 unsigned ImmOffset; 1654 1655 MachineBasicBlock *MBB = MI.getParent(); 1656 const DebugLoc &DL = MI.getDebugLoc(); 1657 1658 MachineInstr *Readfirstlane = nullptr; 1659 1660 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1661 // incoming offset, in case there's an add of a constant. We'll have to put it 1662 // back later. 1663 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1664 Readfirstlane = OffsetDef; 1665 BaseOffset = OffsetDef->getOperand(1).getReg(); 1666 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1667 } 1668 1669 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1670 // If we have a constant offset, try to use the 0 in m0 as the base. 1671 // TODO: Look into changing the default m0 initialization value. If the 1672 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1673 // the immediate offset. 1674 1675 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1676 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1677 .addImm(0); 1678 } else { 1679 std::tie(BaseOffset, ImmOffset) = 1680 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB); 1681 1682 if (Readfirstlane) { 1683 // We have the constant offset now, so put the readfirstlane back on the 1684 // variable component. 1685 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1686 return false; 1687 1688 Readfirstlane->getOperand(1).setReg(BaseOffset); 1689 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1690 } else { 1691 if (!RBI.constrainGenericRegister(BaseOffset, 1692 AMDGPU::SReg_32RegClass, *MRI)) 1693 return false; 1694 } 1695 1696 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1697 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1698 .addReg(BaseOffset) 1699 .addImm(16) 1700 .setOperandDead(3); // Dead scc 1701 1702 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1703 .addReg(M0Base); 1704 } 1705 1706 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1707 // offset field) % 64. Some versions of the programming guide omit the m0 1708 // part, or claim it's from offset 0. 
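  // For illustration: if the incoming offset is %base + 5,
  // getBaseWithConstantOffset splits it so that ImmOffset = 5 goes into the
  // instruction's offset field and %base is shifted into M0[21:16] above,
  // giving a resource id of (<isa opaque base> + %base + 5) % 64 per the
  // formula in the preceding comment.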
1709 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1710 1711 if (HasVSrc) { 1712 Register VSrc = MI.getOperand(1).getReg(); 1713 MIB.addReg(VSrc); 1714 1715 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1716 return false; 1717 } 1718 1719 MIB.addImm(ImmOffset) 1720 .cloneMemRefs(MI); 1721 1722 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); 1723 1724 MI.eraseFromParent(); 1725 return true; 1726 } 1727 1728 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1729 bool IsAppend) const { 1730 Register PtrBase = MI.getOperand(2).getReg(); 1731 LLT PtrTy = MRI->getType(PtrBase); 1732 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1733 1734 unsigned Offset; 1735 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1736 1737 // TODO: Should this try to look through readfirstlane like GWS? 1738 if (!isDSOffsetLegal(PtrBase, Offset)) { 1739 PtrBase = MI.getOperand(2).getReg(); 1740 Offset = 0; 1741 } 1742 1743 MachineBasicBlock *MBB = MI.getParent(); 1744 const DebugLoc &DL = MI.getDebugLoc(); 1745 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1746 1747 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1748 .addReg(PtrBase); 1749 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) 1750 return false; 1751 1752 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1753 .addImm(Offset) 1754 .addImm(IsGDS ? -1 : 0) 1755 .cloneMemRefs(MI); 1756 MI.eraseFromParent(); 1757 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1758 } 1759 1760 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { 1761 if (TM.getOptLevel() > CodeGenOptLevel::None) { 1762 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; 1763 if (WGSize <= STI.getWavefrontSize()) { 1764 MachineBasicBlock *MBB = MI.getParent(); 1765 const DebugLoc &DL = MI.getDebugLoc(); 1766 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); 1767 MI.eraseFromParent(); 1768 return true; 1769 } 1770 } 1771 1772 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 1773 if (STI.hasSplitBarriers()) { 1774 MachineBasicBlock *MBB = MI.getParent(); 1775 const DebugLoc &DL = MI.getDebugLoc(); 1776 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) 1777 .addImm(AMDGPU::Barrier::WORKGROUP); 1778 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) 1779 .addImm(AMDGPU::Barrier::WORKGROUP); 1780 MI.eraseFromParent(); 1781 return true; 1782 } 1783 1784 return selectImpl(MI, *CoverageInfo); 1785 } 1786 1787 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, 1788 bool &IsTexFail) { 1789 if (TexFailCtrl) 1790 IsTexFail = true; 1791 1792 TFE = (TexFailCtrl & 0x1) ? true : false; 1793 TexFailCtrl &= ~(uint64_t)0x1; 1794 LWE = (TexFailCtrl & 0x2) ? 
true : false; 1795 TexFailCtrl &= ~(uint64_t)0x2; 1796 1797 return TexFailCtrl == 0; 1798 } 1799 1800 bool AMDGPUInstructionSelector::selectImageIntrinsic( 1801 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 1802 MachineBasicBlock *MBB = MI.getParent(); 1803 const DebugLoc &DL = MI.getDebugLoc(); 1804 1805 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 1806 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 1807 1808 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 1809 unsigned IntrOpcode = Intr->BaseOpcode; 1810 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); 1811 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); 1812 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); 1813 1814 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; 1815 1816 Register VDataIn, VDataOut; 1817 LLT VDataTy; 1818 int NumVDataDwords = -1; 1819 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || 1820 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; 1821 1822 bool Unorm; 1823 if (!BaseOpcode->Sampler) 1824 Unorm = true; 1825 else 1826 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0; 1827 1828 bool TFE; 1829 bool LWE; 1830 bool IsTexFail = false; 1831 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(), 1832 TFE, LWE, IsTexFail)) 1833 return false; 1834 1835 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm(); 1836 const bool IsA16 = (Flags & 1) != 0; 1837 const bool IsG16 = (Flags & 2) != 0; 1838 1839 // A16 implies 16 bit gradients if subtarget doesn't support G16 1840 if (IsA16 && !STI.hasG16() && !IsG16) 1841 return false; 1842 1843 unsigned DMask = 0; 1844 unsigned DMaskLanes = 0; 1845 1846 if (BaseOpcode->Atomic) { 1847 VDataOut = MI.getOperand(0).getReg(); 1848 VDataIn = MI.getOperand(2).getReg(); 1849 LLT Ty = MRI->getType(VDataIn); 1850 1851 // Be careful to allow atomic swap on 16-bit element vectors. 1852 const bool Is64Bit = BaseOpcode->AtomicX2 ? 1853 Ty.getSizeInBits() == 128 : 1854 Ty.getSizeInBits() == 64; 1855 1856 if (BaseOpcode->AtomicX2) { 1857 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); 1858 1859 DMask = Is64Bit ? 0xf : 0x3; 1860 NumVDataDwords = Is64Bit ? 4 : 2; 1861 } else { 1862 DMask = Is64Bit ? 0x3 : 0x1; 1863 NumVDataDwords = Is64Bit ? 2 : 1; 1864 } 1865 } else { 1866 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 1867 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 1868 1869 if (BaseOpcode->Store) { 1870 VDataIn = MI.getOperand(1).getReg(); 1871 VDataTy = MRI->getType(VDataIn); 1872 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; 1873 } else if (BaseOpcode->NoReturn) { 1874 NumVDataDwords = 0; 1875 } else { 1876 VDataOut = MI.getOperand(0).getReg(); 1877 VDataTy = MRI->getType(VDataOut); 1878 NumVDataDwords = DMaskLanes; 1879 1880 if (IsD16 && !STI.hasUnpackedD16VMem()) 1881 NumVDataDwords = (DMaskLanes + 1) / 2; 1882 } 1883 } 1884 1885 // Set G16 opcode 1886 if (Subtarget->hasG16() && IsG16) { 1887 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 1888 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 1889 assert(G16MappingInfo); 1890 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 1891 } 1892 1893 // TODO: Check this in verifier. 
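  // Rough worked example of the dword accounting above: a non-atomic,
  // non-gather load with dmask = 0b0111 has DMaskLanes = 3, so
  // NumVDataDwords = 3; with D16 on a packed-D16 subtarget this shrinks to
  // (3 + 1) / 2 = 2. When TFE/LWE requested a status value, one extra dword
  // is appended below.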
1894 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 1895 1896 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); 1897 if (BaseOpcode->Atomic) 1898 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 1899 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 1900 AMDGPU::CPol::VOLATILE)) 1901 return false; 1902 1903 int NumVAddrRegs = 0; 1904 int NumVAddrDwords = 0; 1905 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 1906 // Skip the $noregs and 0s inserted during legalization. 1907 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I); 1908 if (!AddrOp.isReg()) 1909 continue; // XXX - Break? 1910 1911 Register Addr = AddrOp.getReg(); 1912 if (!Addr) 1913 break; 1914 1915 ++NumVAddrRegs; 1916 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 1917 } 1918 1919 // The legalizer preprocessed the intrinsic arguments. If we aren't using 1920 // NSA, these should have been packed into a single value in the first 1921 // address register 1922 const bool UseNSA = 1923 NumVAddrRegs != 1 && 1924 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs 1925 : NumVAddrDwords == NumVAddrRegs); 1926 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 1927 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 1928 return false; 1929 } 1930 1931 if (IsTexFail) 1932 ++NumVDataDwords; 1933 1934 int Opcode = -1; 1935 if (IsGFX12Plus) { 1936 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 1937 NumVDataDwords, NumVAddrDwords); 1938 } else if (IsGFX11Plus) { 1939 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 1940 UseNSA ? AMDGPU::MIMGEncGfx11NSA 1941 : AMDGPU::MIMGEncGfx11Default, 1942 NumVDataDwords, NumVAddrDwords); 1943 } else if (IsGFX10Plus) { 1944 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 1945 UseNSA ? AMDGPU::MIMGEncGfx10NSA 1946 : AMDGPU::MIMGEncGfx10Default, 1947 NumVDataDwords, NumVAddrDwords); 1948 } else { 1949 if (Subtarget->hasGFX90AInsts()) { 1950 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 1951 NumVDataDwords, NumVAddrDwords); 1952 if (Opcode == -1) { 1953 LLVM_DEBUG( 1954 dbgs() 1955 << "requested image instruction is not supported on this GPU\n"); 1956 return false; 1957 } 1958 } 1959 if (Opcode == -1 && 1960 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 1961 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 1962 NumVDataDwords, NumVAddrDwords); 1963 if (Opcode == -1) 1964 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 1965 NumVDataDwords, NumVAddrDwords); 1966 } 1967 if (Opcode == -1) 1968 return false; 1969 1970 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 1971 .cloneMemRefs(MI); 1972 1973 if (VDataOut) { 1974 if (BaseOpcode->AtomicX2) { 1975 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 1976 1977 Register TmpReg = MRI->createVirtualRegister( 1978 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 1979 unsigned SubReg = Is64 ? 
AMDGPU::sub0_sub1 : AMDGPU::sub0; 1980 1981 MIB.addDef(TmpReg); 1982 if (!MRI->use_empty(VDataOut)) { 1983 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 1984 .addReg(TmpReg, RegState::Kill, SubReg); 1985 } 1986 1987 } else { 1988 MIB.addDef(VDataOut); // vdata output 1989 } 1990 } 1991 1992 if (VDataIn) 1993 MIB.addReg(VDataIn); // vdata input 1994 1995 for (int I = 0; I != NumVAddrRegs; ++I) { 1996 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I); 1997 if (SrcOp.isReg()) { 1998 assert(SrcOp.getReg() != 0); 1999 MIB.addReg(SrcOp.getReg()); 2000 } 2001 } 2002 2003 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg()); 2004 if (BaseOpcode->Sampler) 2005 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg()); 2006 2007 MIB.addImm(DMask); // dmask 2008 2009 if (IsGFX10Plus) 2010 MIB.addImm(DimInfo->Encoding); 2011 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm)) 2012 MIB.addImm(Unorm); 2013 2014 MIB.addImm(CPol); 2015 MIB.addImm(IsA16 && // a16 or r128 2016 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 2017 if (IsGFX10Plus) 2018 MIB.addImm(IsA16 ? -1 : 0); 2019 2020 if (!Subtarget->hasGFX90AInsts()) { 2021 MIB.addImm(TFE); // tfe 2022 } else if (TFE) { 2023 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n"); 2024 return false; 2025 } 2026 2027 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe)) 2028 MIB.addImm(LWE); // lwe 2029 if (!IsGFX10Plus) 2030 MIB.addImm(DimInfo->DA ? -1 : 0); 2031 if (BaseOpcode->HasD16) 2032 MIB.addImm(IsD16 ? -1 : 0); 2033 2034 MI.eraseFromParent(); 2035 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2036 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); 2037 return true; 2038 } 2039 2040 // We need to handle this here because tablegen doesn't support matching 2041 // instructions with multiple outputs. 
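// The intrinsic has two results (Dst0 and Dst1 below), so it is expanded by
// hand to DS_BVH_STACK_RTN_B32 with an explicit extra def rather than going
// through the generated matcher.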
2042 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic( 2043 MachineInstr &MI) const { 2044 Register Dst0 = MI.getOperand(0).getReg(); 2045 Register Dst1 = MI.getOperand(1).getReg(); 2046 2047 const DebugLoc &DL = MI.getDebugLoc(); 2048 MachineBasicBlock *MBB = MI.getParent(); 2049 2050 Register Addr = MI.getOperand(3).getReg(); 2051 Register Data0 = MI.getOperand(4).getReg(); 2052 Register Data1 = MI.getOperand(5).getReg(); 2053 unsigned Offset = MI.getOperand(6).getImm(); 2054 2055 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0) 2056 .addDef(Dst1) 2057 .addUse(Addr) 2058 .addUse(Data0) 2059 .addUse(Data1) 2060 .addImm(Offset) 2061 .cloneMemRefs(MI); 2062 2063 MI.eraseFromParent(); 2064 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2065 } 2066 2067 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( 2068 MachineInstr &I) const { 2069 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); 2070 switch (IntrinsicID) { 2071 case Intrinsic::amdgcn_end_cf: 2072 return selectEndCfIntrinsic(I); 2073 case Intrinsic::amdgcn_ds_ordered_add: 2074 case Intrinsic::amdgcn_ds_ordered_swap: 2075 return selectDSOrderedIntrinsic(I, IntrinsicID); 2076 case Intrinsic::amdgcn_ds_gws_init: 2077 case Intrinsic::amdgcn_ds_gws_barrier: 2078 case Intrinsic::amdgcn_ds_gws_sema_v: 2079 case Intrinsic::amdgcn_ds_gws_sema_br: 2080 case Intrinsic::amdgcn_ds_gws_sema_p: 2081 case Intrinsic::amdgcn_ds_gws_sema_release_all: 2082 return selectDSGWSIntrinsic(I, IntrinsicID); 2083 case Intrinsic::amdgcn_ds_append: 2084 return selectDSAppendConsume(I, true); 2085 case Intrinsic::amdgcn_ds_consume: 2086 return selectDSAppendConsume(I, false); 2087 case Intrinsic::amdgcn_s_barrier: 2088 return selectSBarrier(I); 2089 case Intrinsic::amdgcn_raw_buffer_load_lds: 2090 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: 2091 case Intrinsic::amdgcn_struct_buffer_load_lds: 2092 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: 2093 return selectBufferLoadLds(I); 2094 case Intrinsic::amdgcn_global_load_lds: 2095 return selectGlobalLoadLds(I); 2096 case Intrinsic::amdgcn_exp_compr: 2097 if (!STI.hasCompressedExport()) { 2098 Function &F = I.getMF()->getFunction(); 2099 DiagnosticInfoUnsupported NoFpRet( 2100 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error); 2101 F.getContext().diagnose(NoFpRet); 2102 return false; 2103 } 2104 break; 2105 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 2106 return selectDSBvhStackIntrinsic(I); 2107 case Intrinsic::amdgcn_s_barrier_init: 2108 case Intrinsic::amdgcn_s_barrier_join: 2109 case Intrinsic::amdgcn_s_wakeup_barrier: 2110 case Intrinsic::amdgcn_s_get_barrier_state: 2111 return selectNamedBarrierInst(I, IntrinsicID); 2112 case Intrinsic::amdgcn_s_barrier_signal_isfirst: 2113 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: 2114 return selectSBarrierSignalIsfirst(I, IntrinsicID); 2115 case Intrinsic::amdgcn_s_barrier_leave: 2116 return selectSBarrierLeave(I); 2117 } 2118 return selectImpl(I, *CoverageInfo); 2119 } 2120 2121 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 2122 if (selectImpl(I, *CoverageInfo)) 2123 return true; 2124 2125 MachineBasicBlock *BB = I.getParent(); 2126 const DebugLoc &DL = I.getDebugLoc(); 2127 2128 Register DstReg = I.getOperand(0).getReg(); 2129 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 2130 assert(Size <= 32 || Size == 64); 2131 const MachineOperand &CCOp = I.getOperand(1); 2132 Register CCReg = CCOp.getReg(); 2133 if (!isVCC(CCReg, *MRI)) 
{ 2134 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 2135 AMDGPU::S_CSELECT_B32; 2136 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 2137 .addReg(CCReg); 2138 2139 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 2140 // bank, because it does not cover the register class that we used to represent 2141 // for it. So we need to manually set the register class here. 2142 if (!MRI->getRegClassOrNull(CCReg)) 2143 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 2144 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 2145 .add(I.getOperand(2)) 2146 .add(I.getOperand(3)); 2147 2148 bool Ret = false; 2149 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 2150 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 2151 I.eraseFromParent(); 2152 return Ret; 2153 } 2154 2155 // Wide VGPR select should have been split in RegBankSelect. 2156 if (Size > 32) 2157 return false; 2158 2159 MachineInstr *Select = 2160 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 2161 .addImm(0) 2162 .add(I.getOperand(3)) 2163 .addImm(0) 2164 .add(I.getOperand(2)) 2165 .add(I.getOperand(1)); 2166 2167 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 2168 I.eraseFromParent(); 2169 return Ret; 2170 } 2171 2172 static int sizeToSubRegIndex(unsigned Size) { 2173 switch (Size) { 2174 case 32: 2175 return AMDGPU::sub0; 2176 case 64: 2177 return AMDGPU::sub0_sub1; 2178 case 96: 2179 return AMDGPU::sub0_sub1_sub2; 2180 case 128: 2181 return AMDGPU::sub0_sub1_sub2_sub3; 2182 case 256: 2183 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; 2184 default: 2185 if (Size < 32) 2186 return AMDGPU::sub0; 2187 if (Size > 256) 2188 return -1; 2189 return sizeToSubRegIndex(llvm::bit_ceil(Size)); 2190 } 2191 } 2192 2193 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 2194 Register DstReg = I.getOperand(0).getReg(); 2195 Register SrcReg = I.getOperand(1).getReg(); 2196 const LLT DstTy = MRI->getType(DstReg); 2197 const LLT SrcTy = MRI->getType(SrcReg); 2198 const LLT S1 = LLT::scalar(1); 2199 2200 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2201 const RegisterBank *DstRB; 2202 if (DstTy == S1) { 2203 // This is a special case. We don't treat s1 for legalization artifacts as 2204 // vcc booleans. 
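    // E.g. a G_TRUNC to s1 produced as a legalization artifact stays on the
    // source's SGPR/VGPR bank and ends up selected as a plain copy further
    // down, rather than being treated as a condition value.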
2205 DstRB = SrcRB; 2206 } else { 2207 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2208 if (SrcRB != DstRB) 2209 return false; 2210 } 2211 2212 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2213 2214 unsigned DstSize = DstTy.getSizeInBits(); 2215 unsigned SrcSize = SrcTy.getSizeInBits(); 2216 2217 const TargetRegisterClass *SrcRC = 2218 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); 2219 const TargetRegisterClass *DstRC = 2220 TRI.getRegClassForSizeOnBank(DstSize, *DstRB); 2221 if (!SrcRC || !DstRC) 2222 return false; 2223 2224 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2225 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 2226 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 2227 return false; 2228 } 2229 2230 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) { 2231 MachineBasicBlock *MBB = I.getParent(); 2232 const DebugLoc &DL = I.getDebugLoc(); 2233 2234 Register LoReg = MRI->createVirtualRegister(DstRC); 2235 Register HiReg = MRI->createVirtualRegister(DstRC); 2236 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 2237 .addReg(SrcReg, 0, AMDGPU::sub0); 2238 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 2239 .addReg(SrcReg, 0, AMDGPU::sub1); 2240 2241 if (IsVALU && STI.hasSDWA()) { 2242 // Write the low 16-bits of the high element into the high 16-bits of the 2243 // low element. 2244 MachineInstr *MovSDWA = 2245 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2246 .addImm(0) // $src0_modifiers 2247 .addReg(HiReg) // $src0 2248 .addImm(0) // $clamp 2249 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2250 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2251 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2252 .addReg(LoReg, RegState::Implicit); 2253 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2254 } else { 2255 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 2256 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 2257 Register ImmReg = MRI->createVirtualRegister(DstRC); 2258 if (IsVALU) { 2259 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 2260 .addImm(16) 2261 .addReg(HiReg); 2262 } else { 2263 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 2264 .addReg(HiReg) 2265 .addImm(16) 2266 .setOperandDead(3); // Dead scc 2267 } 2268 2269 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2270 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2271 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 2272 2273 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 2274 .addImm(0xffff); 2275 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 2276 .addReg(LoReg) 2277 .addReg(ImmReg); 2278 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 2279 .addReg(TmpReg0) 2280 .addReg(TmpReg1); 2281 2282 if (!IsVALU) { 2283 And.setOperandDead(3); // Dead scc 2284 Or.setOperandDead(3); // Dead scc 2285 } 2286 } 2287 2288 I.eraseFromParent(); 2289 return true; 2290 } 2291 2292 if (!DstTy.isScalar()) 2293 return false; 2294 2295 if (SrcSize > 32) { 2296 int SubRegIdx = sizeToSubRegIndex(DstSize); 2297 if (SubRegIdx == -1) 2298 return false; 2299 2300 // Deal with weird cases where the class only partially supports the subreg 2301 // index. 
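    // For reference, sizeToSubRegIndex above maps a 16-bit destination to
    // sub0 (sizes below 32 use sub0), a 48-bit one to sub0_sub1 via
    // bit_ceil, and anything above 256 bits to -1, which aborts the copy
    // lowering here.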
2302 const TargetRegisterClass *SrcWithSubRC 2303 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 2304 if (!SrcWithSubRC) 2305 return false; 2306 2307 if (SrcWithSubRC != SrcRC) { 2308 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 2309 return false; 2310 } 2311 2312 I.getOperand(1).setSubReg(SubRegIdx); 2313 } 2314 2315 I.setDesc(TII.get(TargetOpcode::COPY)); 2316 return true; 2317 } 2318 2319 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 2320 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 2321 Mask = maskTrailingOnes<unsigned>(Size); 2322 int SignedMask = static_cast<int>(Mask); 2323 return SignedMask >= -16 && SignedMask <= 64; 2324 } 2325 2326 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 2327 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 2328 Register Reg, const MachineRegisterInfo &MRI, 2329 const TargetRegisterInfo &TRI) const { 2330 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 2331 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) 2332 return RB; 2333 2334 // Ignore the type, since we don't use vcc in artifacts. 2335 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) 2336 return &RBI.getRegBankFromRegClass(*RC, LLT()); 2337 return nullptr; 2338 } 2339 2340 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 2341 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 2342 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 2343 const DebugLoc &DL = I.getDebugLoc(); 2344 MachineBasicBlock &MBB = *I.getParent(); 2345 const Register DstReg = I.getOperand(0).getReg(); 2346 const Register SrcReg = I.getOperand(1).getReg(); 2347 2348 const LLT DstTy = MRI->getType(DstReg); 2349 const LLT SrcTy = MRI->getType(SrcReg); 2350 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 2351 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 2352 const unsigned DstSize = DstTy.getSizeInBits(); 2353 if (!DstTy.isScalar()) 2354 return false; 2355 2356 // Artifact casts should never use vcc. 2357 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 2358 2359 // FIXME: This should probably be illegal and split earlier. 2360 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 2361 if (DstSize <= 32) 2362 return selectCOPY(I); 2363 2364 const TargetRegisterClass *SrcRC = 2365 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); 2366 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 2367 const TargetRegisterClass *DstRC = 2368 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 2369 2370 Register UndefReg = MRI->createVirtualRegister(SrcRC); 2371 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2372 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2373 .addReg(SrcReg) 2374 .addImm(AMDGPU::sub0) 2375 .addReg(UndefReg) 2376 .addImm(AMDGPU::sub1); 2377 I.eraseFromParent(); 2378 2379 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 2380 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 2381 } 2382 2383 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 2384 // 64-bit should have been split up in RegBankSelect 2385 2386 // Try to use an and with a mask if it will save code size. 
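    // shouldUseAndMask only accepts masks that are inline immediates: e.g. a
    // 4-bit zero-extend gives Mask = 0xf (15), so a single V_AND_B32
    // suffices, while an 8-bit one gives 0xff (255), which exceeds 64 and
    // falls through to the V_BFE path below.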
2387 unsigned Mask; 2388 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2389 MachineInstr *ExtI = 2390 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2391 .addImm(Mask) 2392 .addReg(SrcReg); 2393 I.eraseFromParent(); 2394 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2395 } 2396 2397 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2398 MachineInstr *ExtI = 2399 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2400 .addReg(SrcReg) 2401 .addImm(0) // Offset 2402 .addImm(SrcSize); // Width 2403 I.eraseFromParent(); 2404 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2405 } 2406 2407 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2408 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2409 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2410 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2411 return false; 2412 2413 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2414 const unsigned SextOpc = SrcSize == 8 ? 2415 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2416 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2417 .addReg(SrcReg); 2418 I.eraseFromParent(); 2419 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2420 } 2421 2422 // Using a single 32-bit SALU to calculate the high half is smaller than 2423 // S_BFE with a literal constant operand. 2424 if (DstSize > 32 && SrcSize == 32) { 2425 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2426 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; 2427 if (Signed) { 2428 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) 2429 .addReg(SrcReg, 0, SubReg) 2430 .addImm(31) 2431 .setOperandDead(3); // Dead scc 2432 } else { 2433 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) 2434 .addImm(0); 2435 } 2436 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2437 .addReg(SrcReg, 0, SubReg) 2438 .addImm(AMDGPU::sub0) 2439 .addReg(HiReg) 2440 .addImm(AMDGPU::sub1); 2441 I.eraseFromParent(); 2442 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, 2443 *MRI); 2444 } 2445 2446 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2447 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2448 2449 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2450 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2451 // We need a 64-bit register source, but the high bits don't matter. 2452 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2453 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2454 unsigned SubReg = InReg ? 
AMDGPU::sub0 : AMDGPU::NoSubRegister; 2455 2456 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2457 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2458 .addReg(SrcReg, 0, SubReg) 2459 .addImm(AMDGPU::sub0) 2460 .addReg(UndefReg) 2461 .addImm(AMDGPU::sub1); 2462 2463 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2464 .addReg(ExtReg) 2465 .addImm(SrcSize << 16); 2466 2467 I.eraseFromParent(); 2468 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2469 } 2470 2471 unsigned Mask; 2472 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2473 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2474 .addReg(SrcReg) 2475 .addImm(Mask) 2476 .setOperandDead(3); // Dead scc 2477 } else { 2478 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2479 .addReg(SrcReg) 2480 .addImm(SrcSize << 16); 2481 } 2482 2483 I.eraseFromParent(); 2484 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2485 } 2486 2487 return false; 2488 } 2489 2490 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, 2491 Register &Out) { 2492 Register LShlSrc; 2493 if (mi_match(In, MRI, 2494 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) { 2495 Out = LShlSrc; 2496 return true; 2497 } 2498 return false; 2499 } 2500 2501 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { 2502 if (!Subtarget->hasSALUFloatInsts()) 2503 return false; 2504 2505 Register Dst = I.getOperand(0).getReg(); 2506 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2507 if (DstRB->getID() != AMDGPU::SGPRRegBankID) 2508 return false; 2509 2510 Register Src = I.getOperand(1).getReg(); 2511 2512 if (MRI->getType(Dst) == LLT::scalar(32) && 2513 MRI->getType(Src) == LLT::scalar(16)) { 2514 if (isExtractHiElt(*MRI, Src, Src)) { 2515 MachineBasicBlock *BB = I.getParent(); 2516 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) 2517 .addUse(Src); 2518 I.eraseFromParent(); 2519 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); 2520 } 2521 } 2522 2523 return false; 2524 } 2525 2526 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { 2527 MachineBasicBlock *BB = I.getParent(); 2528 MachineOperand &ImmOp = I.getOperand(1); 2529 Register DstReg = I.getOperand(0).getReg(); 2530 unsigned Size = MRI->getType(DstReg).getSizeInBits(); 2531 bool IsFP = false; 2532 2533 // The AMDGPU backend only supports Imm operands and not CImm or FPImm. 2534 if (ImmOp.isFPImm()) { 2535 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); 2536 ImmOp.ChangeToImmediate(Imm.getZExtValue()); 2537 IsFP = true; 2538 } else if (ImmOp.isCImm()) { 2539 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); 2540 } else { 2541 llvm_unreachable("Not supported by g_constants"); 2542 } 2543 2544 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2545 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; 2546 2547 unsigned Opcode; 2548 if (DstRB->getID() == AMDGPU::VCCRegBankID) { 2549 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 2550 } else if (Size == 64 && 2551 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) { 2552 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO; 2553 I.setDesc(TII.get(Opcode)); 2554 I.addImplicitDefUseOperands(*MF); 2555 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2556 } else { 2557 Opcode = IsSgpr ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 2558 2559 // We should never produce s1 values on banks other than VCC. If the user of 2560 // this already constrained the register, we may incorrectly think it's VCC 2561 // if it wasn't originally. 2562 if (Size == 1) 2563 return false; 2564 } 2565 2566 if (Size != 64) { 2567 I.setDesc(TII.get(Opcode)); 2568 I.addImplicitDefUseOperands(*MF); 2569 return constrainSelectedInstRegOperands(I, TII, TRI, RBI); 2570 } 2571 2572 const DebugLoc &DL = I.getDebugLoc(); 2573 2574 APInt Imm(Size, I.getOperand(1).getImm()); 2575 2576 MachineInstr *ResInst; 2577 if (IsSgpr && TII.isInlineConstant(Imm)) { 2578 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 2579 .addImm(I.getOperand(1).getImm()); 2580 } else { 2581 const TargetRegisterClass *RC = IsSgpr ? 2582 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass; 2583 Register LoReg = MRI->createVirtualRegister(RC); 2584 Register HiReg = MRI->createVirtualRegister(RC); 2585 2586 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) 2587 .addImm(Imm.trunc(32).getZExtValue()); 2588 2589 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) 2590 .addImm(Imm.ashr(32).getZExtValue()); 2591 2592 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2593 .addReg(LoReg) 2594 .addImm(AMDGPU::sub0) 2595 .addReg(HiReg) 2596 .addImm(AMDGPU::sub1); 2597 } 2598 2599 // We can't call constrainSelectedInstRegOperands here, because it doesn't 2600 // work for target independent opcodes 2601 I.eraseFromParent(); 2602 const TargetRegisterClass *DstRC = 2603 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI); 2604 if (!DstRC) 2605 return true; 2606 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI); 2607 } 2608 2609 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2610 // Only manually handle the f64 SGPR case. 2611 // 2612 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2613 // the bit ops theoretically have a second result due to the implicit def of 2614 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2615 // that is easy by disabling the check. The result works, but uses a 2616 // nonsensical sreg32orlds_and_sreg_1 regclass. 2617 // 2618 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2619 // the variadic REG_SEQUENCE operands. 
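  // The manual expansion below splits the 64-bit source into 32-bit halves,
  // flips the sign bit of the high half with S_XOR_B32 (or sets it with
  // S_OR_B32 for fneg(fabs(x))), and reassembles the result with a
  // REG_SEQUENCE.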
2620 2621 Register Dst = MI.getOperand(0).getReg(); 2622 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2623 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2624 MRI->getType(Dst) != LLT::scalar(64)) 2625 return false; 2626 2627 Register Src = MI.getOperand(1).getReg(); 2628 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2629 if (Fabs) 2630 Src = Fabs->getOperand(1).getReg(); 2631 2632 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2633 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2634 return false; 2635 2636 MachineBasicBlock *BB = MI.getParent(); 2637 const DebugLoc &DL = MI.getDebugLoc(); 2638 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2639 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2640 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2641 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2642 2643 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2644 .addReg(Src, 0, AMDGPU::sub0); 2645 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2646 .addReg(Src, 0, AMDGPU::sub1); 2647 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2648 .addImm(0x80000000); 2649 2650 // Set or toggle sign bit. 2651 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2652 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2653 .addReg(HiReg) 2654 .addReg(ConstReg) 2655 .setOperandDead(3); // Dead scc 2656 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2657 .addReg(LoReg) 2658 .addImm(AMDGPU::sub0) 2659 .addReg(OpReg) 2660 .addImm(AMDGPU::sub1); 2661 MI.eraseFromParent(); 2662 return true; 2663 } 2664 2665 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2666 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2667 Register Dst = MI.getOperand(0).getReg(); 2668 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2669 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2670 MRI->getType(Dst) != LLT::scalar(64)) 2671 return false; 2672 2673 Register Src = MI.getOperand(1).getReg(); 2674 MachineBasicBlock *BB = MI.getParent(); 2675 const DebugLoc &DL = MI.getDebugLoc(); 2676 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2677 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2678 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2679 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2680 2681 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2682 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2683 return false; 2684 2685 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2686 .addReg(Src, 0, AMDGPU::sub0); 2687 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2688 .addReg(Src, 0, AMDGPU::sub1); 2689 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2690 .addImm(0x7fffffff); 2691 2692 // Clear sign bit. 2693 // TODO: Should this used S_BITSET0_*? 
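  // E.g. for x = -2.0 (0xC000000000000000) the high dword 0xC0000000 is
  // ANDed with 0x7FFFFFFF to give 0x40000000, i.e. +2.0 once the halves are
  // rejoined below.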
2694 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2695 .addReg(HiReg) 2696 .addReg(ConstReg) 2697 .setOperandDead(3); // Dead scc 2698 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2699 .addReg(LoReg) 2700 .addImm(AMDGPU::sub0) 2701 .addReg(OpReg) 2702 .addImm(AMDGPU::sub1); 2703 2704 MI.eraseFromParent(); 2705 return true; 2706 } 2707 2708 static bool isConstant(const MachineInstr &MI) { 2709 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2710 } 2711 2712 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2713 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2714 2715 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1; 2716 const MachineInstr *PtrMI = 2717 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg()); 2718 2719 assert(PtrMI); 2720 2721 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2722 return; 2723 2724 GEPInfo GEPInfo; 2725 2726 for (unsigned i = 1; i != 3; ++i) { 2727 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2728 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2729 assert(OpDef); 2730 if (i == 2 && isConstant(*OpDef)) { 2731 // TODO: Could handle constant base + variable offset, but a combine 2732 // probably should have commuted it. 2733 assert(GEPInfo.Imm == 0); 2734 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2735 continue; 2736 } 2737 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2738 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2739 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2740 else 2741 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2742 } 2743 2744 AddrInfo.push_back(GEPInfo); 2745 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2746 } 2747 2748 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { 2749 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; 2750 } 2751 2752 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2753 if (!MI.hasOneMemOperand()) 2754 return false; 2755 2756 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2757 const Value *Ptr = MMO->getValue(); 2758 2759 // UndefValue means this is a load of a kernel input. These are uniform. 2760 // Sometimes LDS instructions have constant pointers. 2761 // If Ptr is null, then that means this mem operand contains a 2762 // PseudoSourceValue like GOT. 2763 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || 2764 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) 2765 return true; 2766 2767 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2768 return true; 2769 2770 if (MI.getOpcode() == AMDGPU::G_PREFETCH) 2771 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == 2772 AMDGPU::SGPRRegBankID; 2773 2774 const Instruction *I = dyn_cast<Instruction>(Ptr); 2775 return I && I->getMetadata("amdgpu.uniform"); 2776 } 2777 2778 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2779 for (const GEPInfo &GEPInfo : AddrInfo) { 2780 if (!GEPInfo.VgprParts.empty()) 2781 return true; 2782 } 2783 return false; 2784 } 2785 2786 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2787 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2788 unsigned AS = PtrTy.getAddressSpace(); 2789 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2790 STI.ldsRequiresM0Init()) { 2791 MachineBasicBlock *BB = I.getParent(); 2792 2793 // If DS instructions require M0 initialization, insert it before selecting. 
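    // On these subtargets M0 roughly acts as an address bound for DS
    // operations, so writing all ones keeps the full LDS range addressable.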
2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2795 .addImm(-1); 2796 } 2797 } 2798 2799 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( 2800 MachineInstr &I) const { 2801 initM0(I); 2802 return selectImpl(I, *CoverageInfo); 2803 } 2804 2805 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { 2806 if (Reg.isPhysical()) 2807 return false; 2808 2809 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg); 2810 const unsigned Opcode = MI.getOpcode(); 2811 2812 if (Opcode == AMDGPU::COPY) 2813 return isVCmpResult(MI.getOperand(1).getReg(), MRI); 2814 2815 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR || 2816 Opcode == AMDGPU::G_XOR) 2817 return isVCmpResult(MI.getOperand(1).getReg(), MRI) && 2818 isVCmpResult(MI.getOperand(2).getReg(), MRI); 2819 2820 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) 2821 return GI->is(Intrinsic::amdgcn_class); 2822 2823 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; 2824 } 2825 2826 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 2827 MachineBasicBlock *BB = I.getParent(); 2828 MachineOperand &CondOp = I.getOperand(0); 2829 Register CondReg = CondOp.getReg(); 2830 const DebugLoc &DL = I.getDebugLoc(); 2831 2832 unsigned BrOpcode; 2833 Register CondPhysReg; 2834 const TargetRegisterClass *ConstrainRC; 2835 2836 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2837 // whether the branch is uniform when selecting the instruction. In 2838 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 2839 // RegBankSelect knows what it's doing if the branch condition is scc, even 2840 // though it currently does not. 2841 if (!isVCC(CondReg, *MRI)) { 2842 if (MRI->getType(CondReg) != LLT::scalar(32)) 2843 return false; 2844 2845 CondPhysReg = AMDGPU::SCC; 2846 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 2847 ConstrainRC = &AMDGPU::SReg_32RegClass; 2848 } else { 2849 // FIXME: Should scc->vcc copies and with exec? 2850 2851 // Unless the value of CondReg is a result of a V_CMP* instruction then we 2852 // need to insert an and with exec. 2853 if (!isVCmpResult(CondReg, *MRI)) { 2854 const bool Is64 = STI.isWave64(); 2855 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 2856 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; 2857 2858 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); 2859 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) 2860 .addReg(CondReg) 2861 .addReg(Exec) 2862 .setOperandDead(3); // Dead scc 2863 CondReg = TmpReg; 2864 } 2865 2866 CondPhysReg = TRI.getVCC(); 2867 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 2868 ConstrainRC = TRI.getBoolRC(); 2869 } 2870 2871 if (!MRI->getRegClassOrNull(CondReg)) 2872 MRI->setRegClass(CondReg, ConstrainRC); 2873 2874 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 2875 .addReg(CondReg); 2876 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 2877 .addMBB(I.getOperand(1).getMBB()); 2878 2879 I.eraseFromParent(); 2880 return true; 2881 } 2882 2883 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 2884 MachineInstr &I) const { 2885 Register DstReg = I.getOperand(0).getReg(); 2886 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2887 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2888 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 2889 if (IsVGPR) 2890 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 2891 2892 return RBI.constrainGenericRegister( 2893 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 2894 } 2895 2896 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 2897 Register DstReg = I.getOperand(0).getReg(); 2898 Register SrcReg = I.getOperand(1).getReg(); 2899 Register MaskReg = I.getOperand(2).getReg(); 2900 LLT Ty = MRI->getType(DstReg); 2901 LLT MaskTy = MRI->getType(MaskReg); 2902 MachineBasicBlock *BB = I.getParent(); 2903 const DebugLoc &DL = I.getDebugLoc(); 2904 2905 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2906 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2907 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 2908 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 2909 if (DstRB != SrcRB) // Should only happen for hand written MIR. 2910 return false; 2911 2912 // Try to avoid emitting a bit operation when we only need to touch half of 2913 // the 64-bit pointer. 2914 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64); 2915 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 2916 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 2917 2918 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; 2919 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; 2920 2921 if (!IsVGPR && Ty.getSizeInBits() == 64 && 2922 !CanCopyLow32 && !CanCopyHi32) { 2923 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) 2924 .addReg(SrcReg) 2925 .addReg(MaskReg) 2926 .setOperandDead(3); // Dead scc 2927 I.eraseFromParent(); 2928 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2929 } 2930 2931 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2932 const TargetRegisterClass &RegRC 2933 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 2934 2935 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); 2936 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); 2937 const TargetRegisterClass *MaskRC = 2938 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); 2939 2940 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 2941 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2942 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 2943 return false; 2944 2945 if (Ty.getSizeInBits() == 32) { 2946 assert(MaskTy.getSizeInBits() == 32 && 2947 "ptrmask should have been narrowed during legalize"); 2948 2949 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 2950 .addReg(SrcReg) 2951 .addReg(MaskReg); 2952 2953 if (!IsVGPR) 2954 NewOp.setOperandDead(3); // Dead scc 2955 I.eraseFromParent(); 2956 return true; 2957 } 2958 2959 Register HiReg = MRI->createVirtualRegister(&RegRC); 2960 Register LoReg = MRI->createVirtualRegister(&RegRC); 2961 2962 // Extract the subregisters from the source pointer. 2963 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 2964 .addReg(SrcReg, 0, AMDGPU::sub0); 2965 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 2966 .addReg(SrcReg, 0, AMDGPU::sub1); 2967 2968 Register MaskedLo, MaskedHi; 2969 2970 if (CanCopyLow32) { 2971 // If all the bits in the low half are 1, we only need a copy for it. 2972 MaskedLo = LoReg; 2973 } else { 2974 // Extract the mask subregister and apply the and. 
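    // E.g. aligning a pointer with mask 0xFFFFFFFFFFFFF000: the high 32 mask
    // bits are all ones, so CanCopyHi32 turns the high half into a plain
    // copy, and only the low half 0xFFFFF000 needs the AND emitted here.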
2975 Register MaskLo = MRI->createVirtualRegister(&RegRC); 2976 MaskedLo = MRI->createVirtualRegister(&RegRC); 2977 2978 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 2979 .addReg(MaskReg, 0, AMDGPU::sub0); 2980 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 2981 .addReg(LoReg) 2982 .addReg(MaskLo); 2983 } 2984 2985 if (CanCopyHi32) { 2986 // If all the bits in the high half are 1, we only need a copy for it. 2987 MaskedHi = HiReg; 2988 } else { 2989 Register MaskHi = MRI->createVirtualRegister(&RegRC); 2990 MaskedHi = MRI->createVirtualRegister(&RegRC); 2991 2992 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 2993 .addReg(MaskReg, 0, AMDGPU::sub1); 2994 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 2995 .addReg(HiReg) 2996 .addReg(MaskHi); 2997 } 2998 2999 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 3000 .addReg(MaskedLo) 3001 .addImm(AMDGPU::sub0) 3002 .addReg(MaskedHi) 3003 .addImm(AMDGPU::sub1); 3004 I.eraseFromParent(); 3005 return true; 3006 } 3007 3008 /// Return the register to use for the index value, and the subregister to use 3009 /// for the indirectly accessed register. 3010 static std::pair<Register, unsigned> 3011 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, 3012 const TargetRegisterClass *SuperRC, Register IdxReg, 3013 unsigned EltSize, GISelKnownBits &KnownBits) { 3014 Register IdxBaseReg; 3015 int Offset; 3016 3017 std::tie(IdxBaseReg, Offset) = 3018 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits); 3019 if (IdxBaseReg == AMDGPU::NoRegister) { 3020 // This will happen if the index is a known constant. This should ordinarily 3021 // be legalized out, but handle it as a register just in case. 3022 assert(Offset == 0); 3023 IdxBaseReg = IdxReg; 3024 } 3025 3026 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 3027 3028 // Skip out of bounds offsets, or else we would end up using an undefined 3029 // register. 3030 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 3031 return std::pair(IdxReg, SubRegs[0]); 3032 return std::pair(IdxBaseReg, SubRegs[Offset]); 3033 } 3034 3035 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 3036 MachineInstr &MI) const { 3037 Register DstReg = MI.getOperand(0).getReg(); 3038 Register SrcReg = MI.getOperand(1).getReg(); 3039 Register IdxReg = MI.getOperand(2).getReg(); 3040 3041 LLT DstTy = MRI->getType(DstReg); 3042 LLT SrcTy = MRI->getType(SrcReg); 3043 3044 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3045 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 3046 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3047 3048 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3049 // into a waterfall loop. 
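  // computeIndirectRegIndex (above) folds a constant addend into the
  // subregister: e.g. for a <4 x s32> source with 32-bit elements and an
  // index of %base + 2 it returns {%base, sub2}; an out-of-range constant
  // offset falls back to the original index register with the first
  // subregister.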
3050 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3051 return false; 3052 3053 const TargetRegisterClass *SrcRC = 3054 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); 3055 const TargetRegisterClass *DstRC = 3056 TRI.getRegClassForTypeOnBank(DstTy, *DstRB); 3057 if (!SrcRC || !DstRC) 3058 return false; 3059 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 3060 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 3061 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3062 return false; 3063 3064 MachineBasicBlock *BB = MI.getParent(); 3065 const DebugLoc &DL = MI.getDebugLoc(); 3066 const bool Is64 = DstTy.getSizeInBits() == 64; 3067 3068 unsigned SubReg; 3069 std::tie(IdxReg, SubReg) = computeIndirectRegIndex( 3070 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB); 3071 3072 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 3073 if (DstTy.getSizeInBits() != 32 && !Is64) 3074 return false; 3075 3076 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3077 .addReg(IdxReg); 3078 3079 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 3080 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 3081 .addReg(SrcReg, 0, SubReg) 3082 .addReg(SrcReg, RegState::Implicit); 3083 MI.eraseFromParent(); 3084 return true; 3085 } 3086 3087 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 3088 return false; 3089 3090 if (!STI.useVGPRIndexMode()) { 3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3092 .addReg(IdxReg); 3093 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 3094 .addReg(SrcReg, 0, SubReg) 3095 .addReg(SrcReg, RegState::Implicit); 3096 MI.eraseFromParent(); 3097 return true; 3098 } 3099 3100 const MCInstrDesc &GPRIDXDesc = 3101 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 3102 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3103 .addReg(SrcReg) 3104 .addReg(IdxReg) 3105 .addImm(SubReg); 3106 3107 MI.eraseFromParent(); 3108 return true; 3109 } 3110 3111 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 3112 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 3113 MachineInstr &MI) const { 3114 Register DstReg = MI.getOperand(0).getReg(); 3115 Register VecReg = MI.getOperand(1).getReg(); 3116 Register ValReg = MI.getOperand(2).getReg(); 3117 Register IdxReg = MI.getOperand(3).getReg(); 3118 3119 LLT VecTy = MRI->getType(DstReg); 3120 LLT ValTy = MRI->getType(ValReg); 3121 unsigned VecSize = VecTy.getSizeInBits(); 3122 unsigned ValSize = ValTy.getSizeInBits(); 3123 3124 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 3125 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 3126 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3127 3128 assert(VecTy.getElementType() == ValTy); 3129 3130 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3131 // into a waterfall loop. 
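  // Two strategies are used below: without VGPR index mode the index is
  // copied into M0 and a movrel-style pseudo performs the element write;
  // with it, the GPR-index pseudo takes the index as a regular operand
  // instead.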
3132 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3133 return false; 3134 3135 const TargetRegisterClass *VecRC = 3136 TRI.getRegClassForTypeOnBank(VecTy, *VecRB); 3137 const TargetRegisterClass *ValRC = 3138 TRI.getRegClassForTypeOnBank(ValTy, *ValRB); 3139 3140 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 3141 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 3142 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 3143 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3144 return false; 3145 3146 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 3147 return false; 3148 3149 unsigned SubReg; 3150 std::tie(IdxReg, SubReg) = 3151 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB); 3152 3153 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 3154 STI.useVGPRIndexMode(); 3155 3156 MachineBasicBlock *BB = MI.getParent(); 3157 const DebugLoc &DL = MI.getDebugLoc(); 3158 3159 if (!IndexMode) { 3160 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3161 .addReg(IdxReg); 3162 3163 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 3164 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 3165 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 3166 .addReg(VecReg) 3167 .addReg(ValReg) 3168 .addImm(SubReg); 3169 MI.eraseFromParent(); 3170 return true; 3171 } 3172 3173 const MCInstrDesc &GPRIDXDesc = 3174 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 3175 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3176 .addReg(VecReg) 3177 .addReg(ValReg) 3178 .addReg(IdxReg) 3179 .addImm(SubReg); 3180 3181 MI.eraseFromParent(); 3182 return true; 3183 } 3184 3185 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { 3186 assert(!AMDGPU::isGFX12Plus(STI)); 3187 unsigned Opc; 3188 unsigned Size = MI.getOperand(3).getImm(); 3189 3190 // The struct intrinsic variants add one additional operand over raw. 3191 const bool HasVIndex = MI.getNumOperands() == 9; 3192 Register VIndex; 3193 int OpOffset = 0; 3194 if (HasVIndex) { 3195 VIndex = MI.getOperand(4).getReg(); 3196 OpOffset = 1; 3197 } 3198 3199 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3200 std::optional<ValueAndVReg> MaybeVOffset = 3201 getIConstantVRegValWithLookThrough(VOffset, *MRI); 3202 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); 3203 3204 switch (Size) { 3205 default: 3206 return false; 3207 case 1: 3208 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 3209 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 3210 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 3211 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 3212 break; 3213 case 2: 3214 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 3215 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 3216 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 3217 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 3218 break; 3219 case 4: 3220 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 3221 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 3222 : HasVOffset ? 
AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 3223 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 3224 break; 3225 } 3226 3227 MachineBasicBlock *MBB = MI.getParent(); 3228 const DebugLoc &DL = MI.getDebugLoc(); 3229 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3230 .add(MI.getOperand(2)); 3231 3232 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); 3233 3234 if (HasVIndex && HasVOffset) { 3235 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 3236 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 3237 .addReg(VIndex) 3238 .addImm(AMDGPU::sub0) 3239 .addReg(VOffset) 3240 .addImm(AMDGPU::sub1); 3241 3242 MIB.addReg(IdxReg); 3243 } else if (HasVIndex) { 3244 MIB.addReg(VIndex); 3245 } else if (HasVOffset) { 3246 MIB.addReg(VOffset); 3247 } 3248 3249 MIB.add(MI.getOperand(1)); // rsrc 3250 MIB.add(MI.getOperand(5 + OpOffset)); // soffset 3251 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset 3252 unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); 3253 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol 3254 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz 3255 3256 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3257 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3258 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); 3259 MachinePointerInfo StorePtrI = LoadPtrI; 3260 StorePtrI.V = nullptr; 3261 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3262 3263 auto F = LoadMMO->getFlags() & 3264 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3265 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3266 Size, LoadMMO->getBaseAlign()); 3267 3268 MachineMemOperand *StoreMMO = 3269 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3270 sizeof(int32_t), LoadMMO->getBaseAlign()); 3271 3272 MIB.setMemRefs({LoadMMO, StoreMMO}); 3273 3274 MI.eraseFromParent(); 3275 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3276 } 3277 3278 /// Match a zero extend from a 32-bit value to 64-bits. 3279 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { 3280 Register ZExtSrc; 3281 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) 3282 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register(); 3283 3284 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) 3285 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 3286 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) 3287 return Register(); 3288 3289 assert(Def->getNumOperands() == 3 && 3290 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); 3291 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { 3292 return Def->getOperand(1).getReg(); 3293 } 3294 3295 return Register(); 3296 } 3297 3298 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ 3299 unsigned Opc; 3300 unsigned Size = MI.getOperand(3).getImm(); 3301 3302 switch (Size) { 3303 default: 3304 return false; 3305 case 1: 3306 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 3307 break; 3308 case 2: 3309 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 3310 break; 3311 case 4: 3312 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 3313 break; 3314 } 3315 3316 MachineBasicBlock *MBB = MI.getParent(); 3317 const DebugLoc &DL = MI.getDebugLoc(); 3318 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3319 .add(MI.getOperand(2)); 3320 3321 Register Addr = MI.getOperand(1).getReg(); 3322 Register VOffset; 3323 // Try to split SAddr and VOffset. 
Global and LDS pointers share the same 3324 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 3325 if (!isSGPR(Addr)) { 3326 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3327 if (isSGPR(AddrDef->Reg)) { 3328 Addr = AddrDef->Reg; 3329 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3330 Register SAddr = 3331 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3332 if (isSGPR(SAddr)) { 3333 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3334 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3335 Addr = SAddr; 3336 VOffset = Off; 3337 } 3338 } 3339 } 3340 } 3341 3342 if (isSGPR(Addr)) { 3343 Opc = AMDGPU::getGlobalSaddrOp(Opc); 3344 if (!VOffset) { 3345 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3346 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3347 .addImm(0); 3348 } 3349 } 3350 3351 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3352 .addReg(Addr); 3353 3354 if (isSGPR(Addr)) 3355 MIB.addReg(VOffset); 3356 3357 MIB.add(MI.getOperand(4)) // offset 3358 .add(MI.getOperand(5)); // cpol 3359 3360 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3361 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3362 LoadPtrI.Offset = MI.getOperand(4).getImm(); 3363 MachinePointerInfo StorePtrI = LoadPtrI; 3364 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 3365 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3366 auto F = LoadMMO->getFlags() & 3367 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3368 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3369 Size, LoadMMO->getBaseAlign()); 3370 MachineMemOperand *StoreMMO = 3371 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3372 sizeof(int32_t), Align(4)); 3373 3374 MIB.setMemRefs({LoadMMO, StoreMMO}); 3375 3376 MI.eraseFromParent(); 3377 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3378 } 3379 3380 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{ 3381 MI.setDesc(TII.get(MI.getOperand(1).getImm())); 3382 MI.removeOperand(1); 3383 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3384 return true; 3385 } 3386 3387 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { 3388 unsigned Opc; 3389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3390 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 3391 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; 3392 break; 3393 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 3394 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; 3395 break; 3396 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 3397 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; 3398 break; 3399 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 3400 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; 3401 break; 3402 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 3403 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; 3404 break; 3405 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 3406 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; 3407 break; 3408 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 3409 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64; 3410 break; 3411 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 3412 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64; 3413 break; 3414 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 3415 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64; 3416 break; 3417 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 3418 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64; 3419 
break; 3420 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 3421 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64; 3422 break; 3423 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 3424 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64; 3425 break; 3426 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 3427 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64; 3428 break; 3429 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: 3430 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64; 3431 break; 3432 default: 3433 llvm_unreachable("unhandled smfmac intrinsic"); 3434 } 3435 3436 auto VDst_In = MI.getOperand(4); 3437 3438 MI.setDesc(TII.get(Opc)); 3439 MI.removeOperand(4); // VDst_In 3440 MI.removeOperand(1); // Intrinsic ID 3441 MI.addOperand(VDst_In); // Readd VDst_In to the end 3442 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3443 return true; 3444 } 3445 3446 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { 3447 Register DstReg = MI.getOperand(0).getReg(); 3448 Register SrcReg = MI.getOperand(1).getReg(); 3449 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3450 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3451 MachineBasicBlock *MBB = MI.getParent(); 3452 const DebugLoc &DL = MI.getDebugLoc(); 3453 3454 if (IsVALU) { 3455 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 3456 .addImm(Subtarget->getWavefrontSizeLog2()) 3457 .addReg(SrcReg); 3458 } else { 3459 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 3460 .addReg(SrcReg) 3461 .addImm(Subtarget->getWavefrontSizeLog2()) 3462 .setOperandDead(3); // Dead scc 3463 } 3464 3465 const TargetRegisterClass &RC = 3466 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3467 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 3468 return false; 3469 3470 MI.eraseFromParent(); 3471 return true; 3472 } 3473 3474 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { 3475 Register SrcReg = MI.getOperand(0).getReg(); 3476 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 3477 return false; 3478 3479 MachineInstr *DefMI = MRI->getVRegDef(SrcReg); 3480 Register SP = 3481 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); 3482 Register WaveAddr = getWaveAddress(DefMI); 3483 MachineBasicBlock *MBB = MI.getParent(); 3484 const DebugLoc &DL = MI.getDebugLoc(); 3485 3486 if (!WaveAddr) { 3487 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3488 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) 3489 .addReg(SrcReg) 3490 .addImm(Subtarget->getWavefrontSizeLog2()) 3491 .setOperandDead(3); // Dead scc 3492 } 3493 3494 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) 3495 .addReg(WaveAddr); 3496 3497 MI.eraseFromParent(); 3498 return true; 3499 } 3500 3501 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 3502 3503 if (!I.isPreISelOpcode()) { 3504 if (I.isCopy()) 3505 return selectCOPY(I); 3506 return true; 3507 } 3508 3509 switch (I.getOpcode()) { 3510 case TargetOpcode::G_AND: 3511 case TargetOpcode::G_OR: 3512 case TargetOpcode::G_XOR: 3513 if (selectImpl(I, *CoverageInfo)) 3514 return true; 3515 return selectG_AND_OR_XOR(I); 3516 case TargetOpcode::G_ADD: 3517 case TargetOpcode::G_SUB: 3518 case TargetOpcode::G_PTR_ADD: 3519 if (selectImpl(I, *CoverageInfo)) 3520 return true; 3521 return selectG_ADD_SUB(I); 3522 case TargetOpcode::G_UADDO: 3523 case TargetOpcode::G_USUBO: 3524 case TargetOpcode::G_UADDE: 3525 case TargetOpcode::G_USUBE: 3526 
return selectG_UADDO_USUBO_UADDE_USUBE(I);
3527 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3528 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3529 return selectG_AMDGPU_MAD_64_32(I);
3530 case TargetOpcode::G_INTTOPTR:
3531 case TargetOpcode::G_BITCAST:
3532 case TargetOpcode::G_PTRTOINT:
3533 case TargetOpcode::G_FREEZE:
3534 return selectCOPY(I);
3535 case TargetOpcode::G_CONSTANT:
3536 case TargetOpcode::G_FCONSTANT:
3537 return selectG_CONSTANT(I);
3538 case TargetOpcode::G_FNEG:
3539 if (selectImpl(I, *CoverageInfo))
3540 return true;
3541 return selectG_FNEG(I);
3542 case TargetOpcode::G_FABS:
3543 if (selectImpl(I, *CoverageInfo))
3544 return true;
3545 return selectG_FABS(I);
3546 case TargetOpcode::G_EXTRACT:
3547 return selectG_EXTRACT(I);
3548 case TargetOpcode::G_MERGE_VALUES:
3549 case TargetOpcode::G_CONCAT_VECTORS:
3550 return selectG_MERGE_VALUES(I);
3551 case TargetOpcode::G_UNMERGE_VALUES:
3552 return selectG_UNMERGE_VALUES(I);
3553 case TargetOpcode::G_BUILD_VECTOR:
3554 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3555 return selectG_BUILD_VECTOR(I);
3556 case TargetOpcode::G_IMPLICIT_DEF:
3557 return selectG_IMPLICIT_DEF(I);
3558 case TargetOpcode::G_INSERT:
3559 return selectG_INSERT(I);
3560 case TargetOpcode::G_INTRINSIC:
3561 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3562 return selectG_INTRINSIC(I);
3563 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3564 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3565 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3566 case TargetOpcode::G_ICMP:
3567 case TargetOpcode::G_FCMP:
3568 if (selectG_ICMP_or_FCMP(I))
3569 return true;
3570 return selectImpl(I, *CoverageInfo);
3571 case TargetOpcode::G_LOAD:
3572 case TargetOpcode::G_STORE:
3573 case TargetOpcode::G_ATOMIC_CMPXCHG:
3574 case TargetOpcode::G_ATOMICRMW_XCHG:
3575 case TargetOpcode::G_ATOMICRMW_ADD:
3576 case TargetOpcode::G_ATOMICRMW_SUB:
3577 case TargetOpcode::G_ATOMICRMW_AND:
3578 case TargetOpcode::G_ATOMICRMW_OR:
3579 case TargetOpcode::G_ATOMICRMW_XOR:
3580 case TargetOpcode::G_ATOMICRMW_MIN:
3581 case TargetOpcode::G_ATOMICRMW_MAX:
3582 case TargetOpcode::G_ATOMICRMW_UMIN:
3583 case TargetOpcode::G_ATOMICRMW_UMAX:
3584 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3585 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3586 case TargetOpcode::G_ATOMICRMW_FADD:
3587 case TargetOpcode::G_ATOMICRMW_FMIN:
3588 case TargetOpcode::G_ATOMICRMW_FMAX:
3589 return selectG_LOAD_STORE_ATOMICRMW(I);
3590 case TargetOpcode::G_SELECT:
3591 return selectG_SELECT(I);
3592 case TargetOpcode::G_TRUNC:
3593 return selectG_TRUNC(I);
3594 case TargetOpcode::G_SEXT:
3595 case TargetOpcode::G_ZEXT:
3596 case TargetOpcode::G_ANYEXT:
3597 case TargetOpcode::G_SEXT_INREG:
3598 // This is a workaround. For extension from type i1, `selectImpl()` uses
3599 // patterns from the TD file and generates an illegal VGPR to SGPR COPY as
3600 // type i1 can only be held in an SGPR class.
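// (Added, illustrative:) e.g. a G_ZEXT or G_SEXT with an s1 source is routed
// to selectG_SZA_EXT() below, which expands the boolean explicitly (a select
// of 0/1 or 0/-1) instead of going through the imported patterns.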
3601 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) && 3602 selectImpl(I, *CoverageInfo)) 3603 return true; 3604 return selectG_SZA_EXT(I); 3605 case TargetOpcode::G_FPEXT: 3606 if (selectG_FPEXT(I)) 3607 return true; 3608 return selectImpl(I, *CoverageInfo); 3609 case TargetOpcode::G_BRCOND: 3610 return selectG_BRCOND(I); 3611 case TargetOpcode::G_GLOBAL_VALUE: 3612 return selectG_GLOBAL_VALUE(I); 3613 case TargetOpcode::G_PTRMASK: 3614 return selectG_PTRMASK(I); 3615 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 3616 return selectG_EXTRACT_VECTOR_ELT(I); 3617 case TargetOpcode::G_INSERT_VECTOR_ELT: 3618 return selectG_INSERT_VECTOR_ELT(I); 3619 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 3620 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 3621 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: 3622 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 3623 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 3624 const AMDGPU::ImageDimIntrinsicInfo *Intr = 3625 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I)); 3626 assert(Intr && "not an image intrinsic with image pseudo"); 3627 return selectImageIntrinsic(I, Intr); 3628 } 3629 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: 3630 return selectBVHIntrinsic(I); 3631 case AMDGPU::G_SBFX: 3632 case AMDGPU::G_UBFX: 3633 return selectG_SBFX_UBFX(I); 3634 case AMDGPU::G_SI_CALL: 3635 I.setDesc(TII.get(AMDGPU::SI_CALL)); 3636 return true; 3637 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: 3638 return selectWaveAddress(I); 3639 case AMDGPU::G_STACKRESTORE: 3640 return selectStackRestore(I); 3641 case AMDGPU::G_PHI: 3642 return selectPHI(I); 3643 default: 3644 return selectImpl(I, *CoverageInfo); 3645 } 3646 return false; 3647 } 3648 3649 InstructionSelector::ComplexRendererFns 3650 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 3651 return {{ 3652 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3653 }}; 3654 3655 } 3656 3657 std::pair<Register, unsigned> 3658 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, 3659 bool IsCanonicalizing, 3660 bool AllowAbs, bool OpSel) const { 3661 Register Src = Root.getReg(); 3662 unsigned Mods = 0; 3663 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 3664 3665 if (MI->getOpcode() == AMDGPU::G_FNEG) { 3666 Src = MI->getOperand(1).getReg(); 3667 Mods |= SISrcMods::NEG; 3668 MI = getDefIgnoringCopies(Src, *MRI); 3669 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { 3670 // Fold fsub [+-]0 into fneg. This may not have folded depending on the 3671 // denormal mode, but we're implicitly canonicalizing in a source operand. 3672 const ConstantFP *LHS = 3673 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI); 3674 if (LHS && LHS->isZero()) { 3675 Mods |= SISrcMods::NEG; 3676 Src = MI->getOperand(2).getReg(); 3677 } 3678 } 3679 3680 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { 3681 Src = MI->getOperand(1).getReg(); 3682 Mods |= SISrcMods::ABS; 3683 } 3684 3685 if (OpSel) 3686 Mods |= SISrcMods::OP_SEL_0; 3687 3688 return std::pair(Src, Mods); 3689 } 3690 3691 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( 3692 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, 3693 bool ForceVGPR) const { 3694 if ((Mods != 0 || ForceVGPR) && 3695 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 3696 3697 // If we looked through copies to find source modifiers on an SGPR operand, 3698 // we now have an SGPR register source. To avoid potentially violating the 3699 // constant bus restriction, we need to insert a copy to a VGPR. 
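// Sketch of the situation (added, illustrative): if the folded modifier came
// from (G_FNEG %s) where %s is an SGPR, the VALU user would now read %s
// directly; combined with another SGPR or literal operand this could exceed
// the single constant-bus read, so copy %s into a fresh VGPR first.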
3700 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg()); 3701 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(), 3702 TII.get(AMDGPU::COPY), VGPRSrc) 3703 .addReg(Src); 3704 Src = VGPRSrc; 3705 } 3706 3707 return Src; 3708 } 3709 3710 /// 3711 /// This will select either an SGPR or VGPR operand and will save us from 3712 /// having to write an extra tablegen pattern. 3713 InstructionSelector::ComplexRendererFns 3714 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 3715 return {{ 3716 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 3717 }}; 3718 } 3719 3720 InstructionSelector::ComplexRendererFns 3721 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 3722 Register Src; 3723 unsigned Mods; 3724 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3725 3726 return {{ 3727 [=](MachineInstrBuilder &MIB) { 3728 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 3729 }, 3730 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3731 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3732 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3733 }}; 3734 } 3735 3736 InstructionSelector::ComplexRendererFns 3737 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 3738 Register Src; 3739 unsigned Mods; 3740 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, 3741 /*IsCanonicalizing=*/true, 3742 /*AllowAbs=*/false); 3743 3744 return {{ 3745 [=](MachineInstrBuilder &MIB) { 3746 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 3747 }, 3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3750 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3751 }}; 3752 } 3753 3754 InstructionSelector::ComplexRendererFns 3755 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 3756 return {{ 3757 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 3758 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 3759 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 3760 }}; 3761 } 3762 3763 InstructionSelector::ComplexRendererFns 3764 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 3765 Register Src; 3766 unsigned Mods; 3767 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 3768 3769 return {{ 3770 [=](MachineInstrBuilder &MIB) { 3771 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 3772 }, 3773 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3774 }}; 3775 } 3776 3777 InstructionSelector::ComplexRendererFns 3778 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( 3779 MachineOperand &Root) const { 3780 Register Src; 3781 unsigned Mods; 3782 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); 3783 3784 return {{ 3785 [=](MachineInstrBuilder &MIB) { 3786 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 3787 }, 3788 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3789 }}; 3790 } 3791 3792 InstructionSelector::ComplexRendererFns 3793 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 3794 Register Src; 3795 unsigned Mods; 3796 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, 3797 /*AllowAbs=*/false); 3798 3799 return {{ 3800 [=](MachineInstrBuilder &MIB) { 3801 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 3802 }, 3803 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3804 }}; 3805 } 3806 3807 
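// Descriptive note (added, based on the code below): selectVOP3NoMods only
// matches when the source is not defined by G_FNEG or G_FABS, leaving such
// sources to the modifier-aware VOP3 renderers above.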
InstructionSelector::ComplexRendererFns 3808 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 3809 Register Reg = Root.getReg(); 3810 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 3811 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS) 3812 return {}; 3813 return {{ 3814 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 3815 }}; 3816 } 3817 3818 std::pair<Register, unsigned> 3819 AMDGPUInstructionSelector::selectVOP3PModsImpl( 3820 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { 3821 unsigned Mods = 0; 3822 MachineInstr *MI = MRI.getVRegDef(Src); 3823 3824 if (MI && MI->getOpcode() == AMDGPU::G_FNEG && 3825 // It's possible to see an f32 fneg here, but unlikely. 3826 // TODO: Treat f32 fneg as only high bit. 3827 MRI.getType(Src) == LLT::fixed_vector(2, 16)) { 3828 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 3829 Src = MI->getOperand(1).getReg(); 3830 MI = MRI.getVRegDef(Src); 3831 } 3832 3833 // TODO: Handle G_FSUB 0 as fneg 3834 3835 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 3836 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() 3837 3838 // Packed instructions do not have abs modifiers. 3839 Mods |= SISrcMods::OP_SEL_1; 3840 3841 return std::pair(Src, Mods); 3842 } 3843 3844 InstructionSelector::ComplexRendererFns 3845 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 3846 MachineRegisterInfo &MRI 3847 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3848 3849 Register Src; 3850 unsigned Mods; 3851 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); 3852 3853 return {{ 3854 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3855 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3856 }}; 3857 } 3858 3859 InstructionSelector::ComplexRendererFns 3860 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { 3861 MachineRegisterInfo &MRI 3862 = Root.getParent()->getParent()->getParent()->getRegInfo(); 3863 3864 Register Src; 3865 unsigned Mods; 3866 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); 3867 3868 return {{ 3869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3871 }}; 3872 } 3873 3874 InstructionSelector::ComplexRendererFns 3875 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { 3876 // Literal i1 value set in intrinsic, represents SrcMods for the next operand. 3877 // Value is in Imm operand as i1 sign extended to int64_t. 3878 // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
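// Worked example (added, illustrative): an i1 true operand reaches here as
// imm -1 and yields OP_SEL_1 | NEG; i1 false (imm 0) yields just OP_SEL_1.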
3879 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 3880 "expected i1 value"); 3881 unsigned Mods = SISrcMods::OP_SEL_1; 3882 if (Root.getImm() == -1) 3883 Mods ^= SISrcMods::NEG; 3884 return {{ 3885 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3886 }}; 3887 } 3888 3889 InstructionSelector::ComplexRendererFns 3890 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( 3891 MachineOperand &Root) const { 3892 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 3893 "expected i1 value"); 3894 unsigned Mods = SISrcMods::OP_SEL_1; 3895 if (Root.getImm() != 0) 3896 Mods |= SISrcMods::OP_SEL_0; 3897 3898 return {{ 3899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 3900 }}; 3901 } 3902 3903 static Register buildRegSequence(SmallVectorImpl<Register> &Elts, 3904 MachineInstr *InsertPt, 3905 MachineRegisterInfo &MRI) { 3906 const TargetRegisterClass *DstRegClass; 3907 switch (Elts.size()) { 3908 case 8: 3909 DstRegClass = &AMDGPU::VReg_256RegClass; 3910 break; 3911 case 4: 3912 DstRegClass = &AMDGPU::VReg_128RegClass; 3913 break; 3914 case 2: 3915 DstRegClass = &AMDGPU::VReg_64RegClass; 3916 break; 3917 default: 3918 llvm_unreachable("unhandled Reg sequence size"); 3919 } 3920 3921 MachineIRBuilder B(*InsertPt); 3922 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) 3923 .addDef(MRI.createVirtualRegister(DstRegClass)); 3924 for (unsigned i = 0; i < Elts.size(); ++i) { 3925 MIB.addReg(Elts[i]); 3926 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); 3927 } 3928 return MIB->getOperand(0).getReg(); 3929 } 3930 3931 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, 3932 SmallVectorImpl<Register> &Elts, Register &Src, 3933 MachineInstr *InsertPt, 3934 MachineRegisterInfo &MRI) { 3935 if (ModOpcode == TargetOpcode::G_FNEG) { 3936 Mods |= SISrcMods::NEG; 3937 // Check if all elements also have abs modifier 3938 SmallVector<Register, 8> NegAbsElts; 3939 for (auto El : Elts) { 3940 Register FabsSrc; 3941 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) 3942 break; 3943 NegAbsElts.push_back(FabsSrc); 3944 } 3945 if (Elts.size() != NegAbsElts.size()) { 3946 // Neg 3947 Src = buildRegSequence(Elts, InsertPt, MRI); 3948 } else { 3949 // Neg and Abs 3950 Mods |= SISrcMods::NEG_HI; 3951 Src = buildRegSequence(NegAbsElts, InsertPt, MRI); 3952 } 3953 } else { 3954 assert(ModOpcode == TargetOpcode::G_FABS); 3955 // Abs 3956 Mods |= SISrcMods::NEG_HI; 3957 Src = buildRegSequence(Elts, InsertPt, MRI); 3958 } 3959 } 3960 3961 InstructionSelector::ComplexRendererFns 3962 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { 3963 Register Src = Root.getReg(); 3964 unsigned Mods = SISrcMods::OP_SEL_1; 3965 SmallVector<Register, 8> EltsF32; 3966 3967 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) { 3968 assert(BV->getNumSources() > 0); 3969 // Based on first element decide which mod we match, neg or abs 3970 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0)); 3971 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) 3972 ? 
AMDGPU::G_FNEG 3973 : AMDGPU::G_FABS; 3974 for (unsigned i = 0; i < BV->getNumSources(); ++i) { 3975 ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); 3976 if (ElF32->getOpcode() != ModOpcode) 3977 break; 3978 EltsF32.push_back(ElF32->getOperand(1).getReg()); 3979 } 3980 3981 // All elements had ModOpcode modifier 3982 if (BV->getNumSources() == EltsF32.size()) { 3983 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), 3984 *MRI); 3985 } 3986 } 3987 3988 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 3989 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 3990 } 3991 3992 InstructionSelector::ComplexRendererFns 3993 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { 3994 Register Src = Root.getReg(); 3995 unsigned Mods = SISrcMods::OP_SEL_1; 3996 SmallVector<Register, 8> EltsV2F16; 3997 3998 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 3999 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 4000 Register FNegSrc; 4001 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) 4002 break; 4003 EltsV2F16.push_back(FNegSrc); 4004 } 4005 4006 // All elements had ModOpcode modifier 4007 if (CV->getNumSources() == EltsV2F16.size()) { 4008 Mods |= SISrcMods::NEG; 4009 Mods |= SISrcMods::NEG_HI; 4010 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); 4011 } 4012 } 4013 4014 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4015 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 4016 } 4017 4018 InstructionSelector::ComplexRendererFns 4019 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { 4020 Register Src = Root.getReg(); 4021 unsigned Mods = SISrcMods::OP_SEL_1; 4022 SmallVector<Register, 8> EltsV2F16; 4023 4024 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 4025 assert(CV->getNumSources() > 0); 4026 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0)); 4027 // Based on first element decide which mod we match, neg or abs 4028 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) 4029 ? AMDGPU::G_FNEG 4030 : AMDGPU::G_FABS; 4031 4032 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 4033 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); 4034 if (ElV2F16->getOpcode() != ModOpcode) 4035 break; 4036 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); 4037 } 4038 4039 // All elements had ModOpcode modifier 4040 if (CV->getNumSources() == EltsV2F16.size()) { 4041 MachineIRBuilder B(*Root.getParent()); 4042 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), 4043 *MRI); 4044 } 4045 } 4046 4047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 4049 } 4050 4051 InstructionSelector::ComplexRendererFns 4052 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { 4053 std::optional<FPValueAndVReg> FPValReg; 4054 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { 4055 if (TII.isInlineConstant(FPValReg->Value)) { 4056 return {{[=](MachineInstrBuilder &MIB) { 4057 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); 4058 }}}; 4059 } 4060 // Non-inlineable splat floats should not fall-through for integer immediate 4061 // checks. 
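// Example (added, illustrative): a splat of 1.0 is an inline constant and is
// rendered as an immediate above, while a splat of 0.1 is not inlineable and
// must stay in a register, so stop here rather than falling through to the
// integer immediate check below.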
4062 return {}; 4063 } 4064 4065 APInt ICst; 4066 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { 4067 if (TII.isInlineConstant(ICst)) { 4068 return { 4069 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; 4070 } 4071 } 4072 4073 return {}; 4074 } 4075 4076 InstructionSelector::ComplexRendererFns 4077 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { 4078 Register Src = 4079 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 4080 unsigned Key = 0; 4081 4082 Register ShiftSrc; 4083 std::optional<ValueAndVReg> ShiftAmt; 4084 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 4085 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 4086 ShiftAmt->Value.getZExtValue() % 8 == 0) { 4087 Key = ShiftAmt->Value.getZExtValue() / 8; 4088 Src = ShiftSrc; 4089 } 4090 4091 return {{ 4092 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4093 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 4094 }}; 4095 } 4096 4097 InstructionSelector::ComplexRendererFns 4098 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { 4099 4100 Register Src = 4101 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 4102 unsigned Key = 0; 4103 4104 Register ShiftSrc; 4105 std::optional<ValueAndVReg> ShiftAmt; 4106 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 4107 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 4108 ShiftAmt->Value.getZExtValue() == 16) { 4109 Src = ShiftSrc; 4110 Key = 1; 4111 } 4112 4113 return {{ 4114 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4115 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 4116 }}; 4117 } 4118 4119 InstructionSelector::ComplexRendererFns 4120 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 4121 Register Src; 4122 unsigned Mods; 4123 std::tie(Src, Mods) = selectVOP3ModsImpl(Root); 4124 4125 // FIXME: Handle op_sel 4126 return {{ 4127 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 4128 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4129 }}; 4130 } 4131 4132 InstructionSelector::ComplexRendererFns 4133 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { 4134 Register Src; 4135 unsigned Mods; 4136 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, 4137 /*IsCanonicalizing=*/true, 4138 /*AllowAbs=*/false, 4139 /*OpSel=*/false); 4140 4141 return {{ 4142 [=](MachineInstrBuilder &MIB) { 4143 MIB.addReg( 4144 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 4145 }, 4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4147 }}; 4148 } 4149 4150 InstructionSelector::ComplexRendererFns 4151 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { 4152 Register Src; 4153 unsigned Mods; 4154 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, 4155 /*IsCanonicalizing=*/true, 4156 /*AllowAbs=*/false, 4157 /*OpSel=*/true); 4158 4159 return {{ 4160 [=](MachineInstrBuilder &MIB) { 4161 MIB.addReg( 4162 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 4163 }, 4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4165 }}; 4166 } 4167 4168 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, 4169 Register &Base, 4170 Register *SOffset, 4171 int64_t *Offset) const { 4172 MachineInstr *MI = Root.getParent(); 4173 MachineBasicBlock *MBB = MI->getParent(); 4174 4175 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 4176 // 
// then we can select all ptr + 32-bit offsets.
4177 SmallVector<GEPInfo, 4> AddrInfo;
4178 getAddrModeInfo(*MI, *MRI, AddrInfo);
4179
4180 if (AddrInfo.empty())
4181 return false;
4182
4183 const GEPInfo &GEPI = AddrInfo[0];
4184 std::optional<int64_t> EncodedImm;
4185
4186 if (SOffset && Offset) {
4187 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4188 /*HasSOffset=*/true);
4189 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4190 AddrInfo.size() > 1) {
4191 const GEPInfo &GEPI2 = AddrInfo[1];
4192 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4193 if (Register OffsetReg =
4194 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4195 Base = GEPI2.SgprParts[0];
4196 *SOffset = OffsetReg;
4197 *Offset = *EncodedImm;
4198 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4199 return true;
4200
4201 // For unbuffered smem loads, it is illegal for the Immediate Offset
4202 // to be negative if the resulting (Offset + (M0 or SOffset or zero))
4203 // is negative. Handle the case where the Immediate Offset + SOffset
4204 // is negative.
4205 auto SKnown = KB->getKnownBits(*SOffset);
4206 if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4207 return false;
4208
4209 return true;
4210 }
4211 }
4212 }
4213 return false;
4214 }
4215
4216 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4217 /*HasSOffset=*/false);
4218 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4219 Base = GEPI.SgprParts[0];
4220 *Offset = *EncodedImm;
4221 return true;
4222 }
4223
4224 // SGPR offset is unsigned.
4225 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4226 GEPI.Imm != 0) {
4227 // If we make it this far we have a load with a 32-bit immediate offset.
4228 // It is OK to select this using an SGPR offset, because we have already
4229 // failed trying to select this load into one of the _IMM variants since
4230 // the _IMM patterns are considered before the _SGPR patterns.
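// Sketch (added, illustrative): if the immediate was rejected by the _IMM
// forms for this subtarget, it is re-materialized below with S_MOV_B32 into
// *SOffset and the _SGPR form of the load is used instead.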
4231 Base = GEPI.SgprParts[0]; 4232 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4233 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset) 4234 .addImm(GEPI.Imm); 4235 return true; 4236 } 4237 4238 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { 4239 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { 4240 Base = GEPI.SgprParts[0]; 4241 *SOffset = OffsetReg; 4242 return true; 4243 } 4244 } 4245 4246 return false; 4247 } 4248 4249 InstructionSelector::ComplexRendererFns 4250 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 4251 Register Base; 4252 int64_t Offset; 4253 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) 4254 return std::nullopt; 4255 4256 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4257 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 4258 } 4259 4260 InstructionSelector::ComplexRendererFns 4261 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 4262 SmallVector<GEPInfo, 4> AddrInfo; 4263 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 4264 4265 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 4266 return std::nullopt; 4267 4268 const GEPInfo &GEPInfo = AddrInfo[0]; 4269 Register PtrReg = GEPInfo.SgprParts[0]; 4270 std::optional<int64_t> EncodedImm = 4271 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 4272 if (!EncodedImm) 4273 return std::nullopt; 4274 4275 return {{ 4276 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 4277 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 4278 }}; 4279 } 4280 4281 InstructionSelector::ComplexRendererFns 4282 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 4283 Register Base, SOffset; 4284 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) 4285 return std::nullopt; 4286 4287 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4288 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; 4289 } 4290 4291 InstructionSelector::ComplexRendererFns 4292 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { 4293 Register Base, SOffset; 4294 int64_t Offset; 4295 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) 4296 return std::nullopt; 4297 4298 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 4299 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, 4300 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 4301 } 4302 4303 std::pair<Register, int> 4304 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, 4305 uint64_t FlatVariant) const { 4306 MachineInstr *MI = Root.getParent(); 4307 4308 auto Default = std::pair(Root.getReg(), 0); 4309 4310 if (!STI.hasFlatInstOffsets()) 4311 return Default; 4312 4313 Register PtrBase; 4314 int64_t ConstOffset; 4315 std::tie(PtrBase, ConstOffset) = 4316 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 4317 4318 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && 4319 !isFlatScratchBaseLegal(Root.getReg()))) 4320 return Default; 4321 4322 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 4323 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant)) 4324 return Default; 4325 4326 return std::pair(PtrBase, ConstOffset); 4327 } 4328 4329 InstructionSelector::ComplexRendererFns 4330 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 4331 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT); 4332 4333 return {{ 4334 
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4335 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4336 }}; 4337 } 4338 4339 InstructionSelector::ComplexRendererFns 4340 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const { 4341 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal); 4342 4343 return {{ 4344 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4345 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4346 }}; 4347 } 4348 4349 InstructionSelector::ComplexRendererFns 4350 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { 4351 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch); 4352 4353 return {{ 4354 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 4355 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 4356 }}; 4357 } 4358 4359 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) 4360 InstructionSelector::ComplexRendererFns 4361 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { 4362 Register Addr = Root.getReg(); 4363 Register PtrBase; 4364 int64_t ConstOffset; 4365 int64_t ImmOffset = 0; 4366 4367 // Match the immediate offset first, which canonically is moved as low as 4368 // possible. 4369 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 4370 4371 if (ConstOffset != 0) { 4372 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, 4373 SIInstrFlags::FlatGlobal)) { 4374 Addr = PtrBase; 4375 ImmOffset = ConstOffset; 4376 } else { 4377 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); 4378 if (isSGPR(PtrBaseDef->Reg)) { 4379 if (ConstOffset > 0) { 4380 // Offset is too large. 4381 // 4382 // saddr + large_offset -> saddr + 4383 // (voffset = large_offset & ~MaxOffset) + 4384 // (large_offset & MaxOffset); 4385 int64_t SplitImmOffset, RemainderOffset; 4386 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( 4387 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); 4388 4389 if (isUInt<32>(RemainderOffset)) { 4390 MachineInstr *MI = Root.getParent(); 4391 MachineBasicBlock *MBB = MI->getParent(); 4392 Register HighBits = 4393 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4394 4395 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 4396 HighBits) 4397 .addImm(RemainderOffset); 4398 4399 return {{ 4400 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr 4401 [=](MachineInstrBuilder &MIB) { 4402 MIB.addReg(HighBits); 4403 }, // voffset 4404 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, 4405 }}; 4406 } 4407 } 4408 4409 // We are adding a 64 bit SGPR and a constant. If constant bus limit 4410 // is 1 we would need to perform 1 or 2 extra moves for each half of 4411 // the constant and it is better to do a scalar add and then issue a 4412 // single VALU instruction to materialize zero. Otherwise it is less 4413 // instructions to perform VALU adds with immediates or inline literals. 4414 unsigned NumLiterals = 4415 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) + 4416 !TII.isInlineConstant(APInt(32, ConstOffset >> 32)); 4417 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) 4418 return std::nullopt; 4419 } 4420 } 4421 } 4422 4423 // Match the variable offset. 
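// Illustrative shape of the match below (assumed MIR, added for clarity):
//   %addr:p1 = G_PTR_ADD %sbase:sgpr(p1), (G_ZEXT %voff:vgpr(s32))
// selects to saddr = %sbase, voffset = %voff, offset = ImmOffset.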
4424 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 4425 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 4426 // Look through the SGPR->VGPR copy. 4427 Register SAddr = 4428 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 4429 4430 if (isSGPR(SAddr)) { 4431 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 4432 4433 // It's possible voffset is an SGPR here, but the copy to VGPR will be 4434 // inserted later. 4435 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 4436 return {{[=](MachineInstrBuilder &MIB) { // saddr 4437 MIB.addReg(SAddr); 4438 }, 4439 [=](MachineInstrBuilder &MIB) { // voffset 4440 MIB.addReg(VOffset); 4441 }, 4442 [=](MachineInstrBuilder &MIB) { // offset 4443 MIB.addImm(ImmOffset); 4444 }}}; 4445 } 4446 } 4447 } 4448 4449 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 4450 // drop this. 4451 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 4452 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) 4453 return std::nullopt; 4454 4455 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 4456 // moves required to copy a 64-bit SGPR to VGPR. 4457 MachineInstr *MI = Root.getParent(); 4458 MachineBasicBlock *MBB = MI->getParent(); 4459 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4460 4461 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 4462 .addImm(0); 4463 4464 return {{ 4465 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr 4466 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 4467 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4468 }}; 4469 } 4470 4471 InstructionSelector::ComplexRendererFns 4472 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 4473 Register Addr = Root.getReg(); 4474 Register PtrBase; 4475 int64_t ConstOffset; 4476 int64_t ImmOffset = 0; 4477 4478 // Match the immediate offset first, which canonically is moved as low as 4479 // possible. 
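// Illustrative cases handled below (added; assumed MIR):
//   G_FRAME_INDEX %stack.N                 -> saddr = frame index
//   G_PTR_ADD (G_FRAME_INDEX), %off:sgpr   -> saddr = S_ADD_I32 of the two
// with any legal constant offset folded into the immediate field.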
4480 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 4481 4482 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && 4483 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 4484 SIInstrFlags::FlatScratch)) { 4485 Addr = PtrBase; 4486 ImmOffset = ConstOffset; 4487 } 4488 4489 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 4490 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 4491 int FI = AddrDef->MI->getOperand(1).getIndex(); 4492 return {{ 4493 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 4494 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4495 }}; 4496 } 4497 4498 Register SAddr = AddrDef->Reg; 4499 4500 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 4501 Register LHS = AddrDef->MI->getOperand(1).getReg(); 4502 Register RHS = AddrDef->MI->getOperand(2).getReg(); 4503 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 4504 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 4505 4506 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 4507 isSGPR(RHSDef->Reg)) { 4508 int FI = LHSDef->MI->getOperand(1).getIndex(); 4509 MachineInstr &I = *Root.getParent(); 4510 MachineBasicBlock *BB = I.getParent(); 4511 const DebugLoc &DL = I.getDebugLoc(); 4512 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 4513 4514 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) 4515 .addFrameIndex(FI) 4516 .addReg(RHSDef->Reg) 4517 .setOperandDead(3); // Dead scc 4518 } 4519 } 4520 4521 if (!isSGPR(SAddr)) 4522 return std::nullopt; 4523 4524 return {{ 4525 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 4526 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4527 }}; 4528 } 4529 4530 // Check whether the flat scratch SVS swizzle bug affects this access. 4531 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( 4532 Register VAddr, Register SAddr, uint64_t ImmOffset) const { 4533 if (!Subtarget->hasFlatScratchSVSSwizzleBug()) 4534 return false; 4535 4536 // The bug affects the swizzling of SVS accesses if there is any carry out 4537 // from the two low order bits (i.e. from bit 1 into bit 2) when adding 4538 // voffset to (soffset + inst_offset). 4539 auto VKnown = KB->getKnownBits(VAddr); 4540 auto SKnown = KnownBits::computeForAddSub( 4541 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr), 4542 KnownBits::makeConstant(APInt(32, ImmOffset))); 4543 uint64_t VMax = VKnown.getMaxValue().getZExtValue(); 4544 uint64_t SMax = SKnown.getMaxValue().getZExtValue(); 4545 return (VMax & 3) + (SMax & 3) >= 4; 4546 } 4547 4548 InstructionSelector::ComplexRendererFns 4549 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { 4550 Register Addr = Root.getReg(); 4551 Register PtrBase; 4552 int64_t ConstOffset; 4553 int64_t ImmOffset = 0; 4554 4555 // Match the immediate offset first, which canonically is moved as low as 4556 // possible. 
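// Illustrative SVS shape matched below (added; assumed MIR):
//   %a:p5 = G_PTR_ADD %sbase:sgpr, %voff:vgpr   (plus an optional constant)
// selects to vaddr = %voff, saddr = %sbase (or a frame index), with the
// constant folded into the immediate offset.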
4557 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 4558 4559 Register OrigAddr = Addr; 4560 if (ConstOffset != 0 && 4561 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) { 4562 Addr = PtrBase; 4563 ImmOffset = ConstOffset; 4564 } 4565 4566 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 4567 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) 4568 return std::nullopt; 4569 4570 Register RHS = AddrDef->MI->getOperand(2).getReg(); 4571 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) 4572 return std::nullopt; 4573 4574 Register LHS = AddrDef->MI->getOperand(1).getReg(); 4575 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 4576 4577 if (OrigAddr != Addr) { 4578 if (!isFlatScratchBaseLegalSVImm(OrigAddr)) 4579 return std::nullopt; 4580 } else { 4581 if (!isFlatScratchBaseLegalSV(OrigAddr)) 4582 return std::nullopt; 4583 } 4584 4585 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) 4586 return std::nullopt; 4587 4588 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 4589 int FI = LHSDef->MI->getOperand(1).getIndex(); 4590 return {{ 4591 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 4592 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 4593 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4594 }}; 4595 } 4596 4597 if (!isSGPR(LHS)) 4598 return std::nullopt; 4599 4600 return {{ 4601 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 4602 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr 4603 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 4604 }}; 4605 } 4606 4607 InstructionSelector::ComplexRendererFns 4608 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 4609 MachineInstr *MI = Root.getParent(); 4610 MachineBasicBlock *MBB = MI->getParent(); 4611 MachineFunction *MF = MBB->getParent(); 4612 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 4613 4614 int64_t Offset = 0; 4615 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 4616 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 4617 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 4618 4619 // TODO: Should this be inside the render function? The iterator seems to 4620 // move. 4621 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); 4622 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 4623 HighBits) 4624 .addImm(Offset & ~MaxOffset); 4625 4626 return {{[=](MachineInstrBuilder &MIB) { // rsrc 4627 MIB.addReg(Info->getScratchRSrcReg()); 4628 }, 4629 [=](MachineInstrBuilder &MIB) { // vaddr 4630 MIB.addReg(HighBits); 4631 }, 4632 [=](MachineInstrBuilder &MIB) { // soffset 4633 // Use constant zero for soffset and rely on eliminateFrameIndex 4634 // to choose the appropriate frame register if need be. 4635 MIB.addImm(0); 4636 }, 4637 [=](MachineInstrBuilder &MIB) { // offset 4638 MIB.addImm(Offset & MaxOffset); 4639 }}}; 4640 } 4641 4642 assert(Offset == 0 || Offset == -1); 4643 4644 // Try to fold a frame index directly into the MUBUF vaddr field, and any 4645 // offsets. 
4646 std::optional<int> FI; 4647 Register VAddr = Root.getReg(); 4648 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) { 4649 Register PtrBase; 4650 int64_t ConstOffset; 4651 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); 4652 if (ConstOffset != 0) { 4653 if (TII.isLegalMUBUFImmOffset(ConstOffset) && 4654 (!STI.privateMemoryResourceIsRangeChecked() || 4655 KB->signBitIsZero(PtrBase))) { 4656 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); 4657 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 4658 FI = PtrBaseDef->getOperand(1).getIndex(); 4659 else 4660 VAddr = PtrBase; 4661 Offset = ConstOffset; 4662 } 4663 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 4664 FI = RootDef->getOperand(1).getIndex(); 4665 } 4666 } 4667 4668 return {{[=](MachineInstrBuilder &MIB) { // rsrc 4669 MIB.addReg(Info->getScratchRSrcReg()); 4670 }, 4671 [=](MachineInstrBuilder &MIB) { // vaddr 4672 if (FI) 4673 MIB.addFrameIndex(*FI); 4674 else 4675 MIB.addReg(VAddr); 4676 }, 4677 [=](MachineInstrBuilder &MIB) { // soffset 4678 // Use constant zero for soffset and rely on eliminateFrameIndex 4679 // to choose the appropriate frame register if need be. 4680 MIB.addImm(0); 4681 }, 4682 [=](MachineInstrBuilder &MIB) { // offset 4683 MIB.addImm(Offset); 4684 }}}; 4685 } 4686 4687 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 4688 int64_t Offset) const { 4689 if (!isUInt<16>(Offset)) 4690 return false; 4691 4692 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 4693 return true; 4694 4695 // On Southern Islands instruction with a negative base value and an offset 4696 // don't seem to work. 4697 return KB->signBitIsZero(Base); 4698 } 4699 4700 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, 4701 int64_t Offset1, 4702 unsigned Size) const { 4703 if (Offset0 % Size != 0 || Offset1 % Size != 0) 4704 return false; 4705 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size)) 4706 return false; 4707 4708 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 4709 return true; 4710 4711 // On Southern Islands instruction with a negative base value and an offset 4712 // don't seem to work. 4713 return KB->signBitIsZero(Base); 4714 } 4715 4716 // Return whether the operation has NoUnsignedWrap property. 4717 static bool isNoUnsignedWrap(MachineInstr *Addr) { 4718 return Addr->getOpcode() == TargetOpcode::G_OR || 4719 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && 4720 Addr->getFlag(MachineInstr::NoUWrap)); 4721 } 4722 4723 // Check that the base address of flat scratch load/store in the form of `base + 4724 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware 4725 // requirement). We always treat the first operand as the base address here. 4726 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { 4727 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 4728 4729 if (isNoUnsignedWrap(AddrMI)) 4730 return true; 4731 4732 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 4733 // values. 
4734 if (STI.hasSignedScratchOffsets()) 4735 return true; 4736 4737 Register LHS = AddrMI->getOperand(1).getReg(); 4738 Register RHS = AddrMI->getOperand(2).getReg(); 4739 4740 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { 4741 std::optional<ValueAndVReg> RhsValReg = 4742 getIConstantVRegValWithLookThrough(RHS, *MRI); 4743 // If the immediate offset is negative and within certain range, the base 4744 // address cannot also be negative. If the base is also negative, the sum 4745 // would be either negative or much larger than the valid range of scratch 4746 // memory a thread can access. 4747 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && 4748 RhsValReg->Value.getSExtValue() > -0x40000000) 4749 return true; 4750 } 4751 4752 return KB->signBitIsZero(LHS); 4753 } 4754 4755 // Check address value in SGPR/VGPR are legal for flat scratch in the form 4756 // of: SGPR + VGPR. 4757 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { 4758 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 4759 4760 if (isNoUnsignedWrap(AddrMI)) 4761 return true; 4762 4763 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 4764 // values. 4765 if (STI.hasSignedScratchOffsets()) 4766 return true; 4767 4768 Register LHS = AddrMI->getOperand(1).getReg(); 4769 Register RHS = AddrMI->getOperand(2).getReg(); 4770 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); 4771 } 4772 4773 // Check address value in SGPR/VGPR are legal for flat scratch in the form 4774 // of: SGPR + VGPR + Imm. 4775 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( 4776 Register Addr) const { 4777 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 4778 // values. 4779 if (STI.hasSignedScratchOffsets()) 4780 return true; 4781 4782 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 4783 Register Base = AddrMI->getOperand(1).getReg(); 4784 std::optional<DefinitionAndSourceRegister> BaseDef = 4785 getDefSrcRegIgnoringCopies(Base, *MRI); 4786 std::optional<ValueAndVReg> RHSOffset = 4787 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); 4788 assert(RHSOffset); 4789 4790 // If the immediate offset is negative and within certain range, the base 4791 // address cannot also be negative. If the base is also negative, the sum 4792 // would be either negative or much larger than the valid range of scratch 4793 // memory a thread can access. 
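// Rough intuition (added, illustrative): with an offset of, say, -16, a base
// with its sign bit set would put base + offset far outside the scratch range
// a thread can access, so a legal access implies the base is non-negative.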
4794 if (isNoUnsignedWrap(BaseDef->MI) && 4795 (isNoUnsignedWrap(AddrMI) || 4796 (RHSOffset->Value.getSExtValue() < 0 && 4797 RHSOffset->Value.getSExtValue() > -0x40000000))) 4798 return true; 4799 4800 Register LHS = BaseDef->MI->getOperand(1).getReg(); 4801 Register RHS = BaseDef->MI->getOperand(2).getReg(); 4802 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS); 4803 } 4804 4805 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, 4806 unsigned ShAmtBits) const { 4807 assert(MI.getOpcode() == TargetOpcode::G_AND); 4808 4809 std::optional<APInt> RHS = 4810 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI); 4811 if (!RHS) 4812 return false; 4813 4814 if (RHS->countr_one() >= ShAmtBits) 4815 return true; 4816 4817 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg()); 4818 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; 4819 } 4820 4821 InstructionSelector::ComplexRendererFns 4822 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 4823 MachineOperand &Root) const { 4824 Register Reg = Root.getReg(); 4825 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 4826 4827 std::optional<DefinitionAndSourceRegister> Def = 4828 getDefSrcRegIgnoringCopies(Reg, *MRI); 4829 assert(Def && "this shouldn't be an optional result"); 4830 Reg = Def->Reg; 4831 4832 if (Register WaveBase = getWaveAddress(Def->MI)) { 4833 return {{ 4834 [=](MachineInstrBuilder &MIB) { // rsrc 4835 MIB.addReg(Info->getScratchRSrcReg()); 4836 }, 4837 [=](MachineInstrBuilder &MIB) { // soffset 4838 MIB.addReg(WaveBase); 4839 }, 4840 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset 4841 }}; 4842 } 4843 4844 int64_t Offset = 0; 4845 4846 // FIXME: Copy check is a hack 4847 Register BasePtr; 4848 if (mi_match(Reg, *MRI, 4849 m_GPtrAdd(m_Reg(BasePtr), 4850 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) { 4851 if (!TII.isLegalMUBUFImmOffset(Offset)) 4852 return {}; 4853 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI); 4854 Register WaveBase = getWaveAddress(BasePtrDef); 4855 if (!WaveBase) 4856 return {}; 4857 4858 return {{ 4859 [=](MachineInstrBuilder &MIB) { // rsrc 4860 MIB.addReg(Info->getScratchRSrcReg()); 4861 }, 4862 [=](MachineInstrBuilder &MIB) { // soffset 4863 MIB.addReg(WaveBase); 4864 }, 4865 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 4866 }}; 4867 } 4868 4869 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 4870 !TII.isLegalMUBUFImmOffset(Offset)) 4871 return {}; 4872 4873 return {{ 4874 [=](MachineInstrBuilder &MIB) { // rsrc 4875 MIB.addReg(Info->getScratchRSrcReg()); 4876 }, 4877 [=](MachineInstrBuilder &MIB) { // soffset 4878 MIB.addImm(0); 4879 }, 4880 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 4881 }}; 4882 } 4883 4884 std::pair<Register, unsigned> 4885 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 4886 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 4887 if (!RootDef) 4888 return std::pair(Root.getReg(), 0); 4889 4890 int64_t ConstAddr = 0; 4891 4892 Register PtrBase; 4893 int64_t Offset; 4894 std::tie(PtrBase, Offset) = 4895 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 4896 4897 if (Offset) { 4898 if (isDSOffsetLegal(PtrBase, Offset)) { 4899 // (add n0, c0) 4900 return std::pair(PtrBase, Offset); 4901 } 4902 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 4903 // TODO 4904 4905 4906 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 4907 // TODO 4908 4909 } 4910 4911 return 
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
                                              unsigned Size) const {
  Register Reg;
  unsigned Offset;
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
  }};
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
                                                  unsigned Size) const {
  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
  if (!RootDef)
    return std::pair(Root.getReg(), 0);

  int64_t ConstAddr = 0;

  Register PtrBase;
  int64_t Offset;
  std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

  if (Offset) {
    int64_t OffsetValue0 = Offset;
    int64_t OffsetValue1 = Offset + Size;
    if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
      // (add n0, c0)
      return std::pair(PtrBase, OffsetValue0 / Size);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO

  }

  return std::pair(Root.getReg(), 0);
}

/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
    Register Root, const MachineRegisterInfo &MRI) const {
  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return {Root, 0};

  MachineOperand &RHS = RootI->getOperand(2);
  std::optional<ValueAndVReg> MaybeOffset =
      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
  if (!MaybeOffset)
    return {Root, 0};
  return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
}

static void addZeroImm(MachineInstrBuilder &MIB) {
  MIB.addImm(0);
}

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
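/// The descriptor is assembled as a REG_SEQUENCE: the base pointer (or an
/// S_MOV_B64 of 0 when \p BasePtr is invalid) fills sub0_sub1, and the two
/// format words fill sub2_sub3.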
static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                          uint32_t FormatLo, uint32_t FormatHi,
                          Register BasePtr) {
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc2)
    .addImm(FormatLo);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(RSrc3)
    .addImm(FormatHi);

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrcHi)
    .addReg(RSrc2)
    .addImm(AMDGPU::sub0)
    .addReg(RSrc3)
    .addImm(AMDGPU::sub1);

  Register RSrcLo = BasePtr;
  if (!BasePtr) {
    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)
      .addDef(RSrcLo)
      .addImm(0);
  }

  B.buildInstr(AMDGPU::REG_SEQUENCE)
    .addDef(RSrc)
    .addReg(RSrcLo)
    .addImm(AMDGPU::sub0_sub1)
    .addReg(RSrcHi)
    .addImm(AMDGPU::sub2_sub3);

  return RSrc;
}

static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                                const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
}

static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               const SIInstrInfo &TII, Register BasePtr) {
  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();

  // FIXME: Why are half the "default" bits ignored based on the addressing
  // mode?
  return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
}

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;
  Data.N0 = Src;

  Register PtrBase;
  int64_t Offset;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
  if (isUInt<32>(Offset)) {
    Data.N0 = PtrBase;
    Data.Offset = Offset;
  }

  if (MachineInstr *InputAdd
      = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know if this was defined by operand 0
    //
    // TODO: Remove this when we have copy folding optimizations after
    // RegBankSelect.
    Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
    Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
  }

  return Data;
}

/// Return whether the addr64 mubuf mode should be used for the given address.
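/// For example, a divergent (VGPR) base pointer, or an address of the form
/// (ptr_add N2, N3), must be selected through the addr64 VADDR path, while a
/// uniform SGPR base can instead be folded into the resource descriptor.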
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  if (Addr.N2)
    return true;

  const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;
}

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
    MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
  if (TII.isLegalMUBUFImmOffset(ImmOffset))
    return;

  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
    .addDef(SOffset)
    .addImm(ImmOffset);
  ImmOffset = 0;
}

bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
    Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
  if (!STI.hasAddr64() || STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))
    return false;

  Register N0 = AddrData.N0;
  Register N2 = AddrData.N2;
  Register N3 = AddrData.N3;
  Offset = AddrData.Offset;

  // Base pointer for the SRD.
  Register SRDPtr;

  if (N2) {
    if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
      assert(N3);
      if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the default resource from a 0 address.
        VAddr = N0;
      } else {
        SRDPtr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      SRDPtr = N2;
      VAddr = N3;
    }
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource.
    VAddr = N0;
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    SRDPtr = N0;
  }

  MachineIRBuilder B(*Root.getParent());
  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
    int64_t &Offset) const {

  // FIXME: Pattern should not reach here.
  if (STI.useFlatForGlobal())
    return false;

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))
    return false;

  // N0 -> offset, or
  // (N0 + C1) -> offset
  Register SRDPtr = AddrData.N0;
  Offset = AddrData.Offset;

  // TODO: Look through extensions for 32-bit soffset.
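  // For instance, an immediate that exceeds what isLegalMUBUFImmOffset
  // accepts is moved into a freshly materialized SOffset register by
  // splitIllegalMUBUFOffset below, and the immediate field is reset to 0.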
  MachineIRBuilder B(*Root.getParent());

  RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
  splitIllegalMUBUFOffset(B, SOffset, Offset);
  return true;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  Register VAddr;
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
    return {};

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  // pattern.
  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // vaddr
        MIB.addReg(VAddr);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { // offset
        MIB.addImm(Offset);
      },
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm  // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  Register RSrcReg;
  Register SOffset;
  int64_t Offset = 0;

  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { // rsrc
        MIB.addReg(RSrcReg);
      },
      [=](MachineInstrBuilder &MIB) { // soffset
        if (SOffset)
          MIB.addReg(SOffset);
        else if (STI.hasRestrictedSOffset())
          MIB.addReg(AMDGPU::SGPR_NULL);
        else
          MIB.addImm(0);
      },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
      addZeroImm, // cpol
      addZeroImm, // tfe
      addZeroImm, // swz
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {

  Register SOffset = Root.getReg();

  if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
    SOffset = AMDGPU::SGPR_NULL;

  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
}

/// Get an immediate that must be 32 bits, and is treated as zero extended.
static std::optional<uint64_t>
getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
  // getIConstantVRegVal sexts any values, so see if that matters.
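  // e.g. a G_CONSTANT of i32 -1 comes back sign-extended as
  // 0xffffffffffffffff; it passes the isInt<32> check and the low 32 bits
  // (0xffffffff) are returned as the zero-extended immediate.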
  std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
  if (!OffsetVal || !isInt<32>(*OffsetVal))
    return std::nullopt;
  return Lo_32(*OffsetVal);
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);

  std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
  if (!OffsetVal)
    return {};

  std::optional<int64_t> EncodedImm =
      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
  if (!EncodedImm)
    return {};

  return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
  Register SOffset;
  unsigned Offset;
  std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
      *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
  if (!SOffset)
    return std::nullopt;

  std::optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
  if (!EncodedOffset)
    return std::nullopt;

  assert(MRI->getType(SOffset) == LLT::scalar(32));
  return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
           [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}

// Variant of stripBitCast that returns the instruction instead of a
// MachineOperand.
static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
  if (MI->getOpcode() == AMDGPU::G_BITCAST)
    return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
  return MI;
}

// Figure out if this is really an extract of the high 16 bits of a dword;
// returns nullptr if it isn't.
static MachineInstr *isExtractHiElt(MachineInstr *Inst,
                                    MachineRegisterInfo &MRI) {
  Inst = stripBitCast(Inst, MRI);

  if (Inst->getOpcode() != AMDGPU::G_TRUNC)
    return nullptr;

  MachineInstr *TruncOp =
      getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
  TruncOp = stripBitCast(TruncOp, MRI);

  // G_LSHR x, (G_CONSTANT i32 16)
  if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
    auto SrlAmount = getIConstantVRegValWithLookThrough(
        TruncOp->getOperand(2).getReg(), MRI);
    if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
      MachineInstr *SrlOp =
          getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
      return stripBitCast(SrlOp, MRI);
    }
  }

  // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
  // 1, 0 swaps the low/high 16 bits.
  // 1, 1 sets the high 16 bits to be the same as the low 16.
  // In either case, it selects the high elts.
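  // Illustrative MIR shape of this case (register names invented):
  //   %shuf:_(<2 x s16>) = G_SHUFFLE_VECTOR %x, %y, shufflemask(1, 1)
  //   %cast:_(s32) = G_BITCAST %shuf
  //   %hi:_(s16) = G_TRUNC %cast
  // The truncate reads lane 0 of %shuf, which is the high lane of %x.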
  if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
    assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
           LLT::fixed_vector(2, 16));

    ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
    assert(Mask.size() == 2);

    if (Mask[0] == 1 && Mask[1] <= 1) {
      MachineInstr *LHS =
          getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
      return stripBitCast(LHS, MRI);
    }
  }

  return nullptr;
}

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  Matched = false;

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);

  MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
  if (MI->getOpcode() == AMDGPU::G_FPEXT) {
    MachineOperand *MO = &MI->getOperand(1);
    Src = MO->getReg();
    MI = getDefIgnoringCopies(Src, *MRI);

    assert(MRI->getType(Src) == LLT::scalar(16));

    // See through bitcasts.
    // FIXME: Would be nice to use stripBitCast here.
    if (MI->getOpcode() == AMDGPU::G_BITCAST) {
      MO = &MI->getOperand(1);
      Src = MO->getReg();
      MI = getDefIgnoringCopies(Src, *MRI);
    }

    const auto CheckAbsNeg = [&]() {
      // Be careful about folding modifiers if we already have an abs. fneg is
      // applied last, so we don't want to apply an earlier fneg.
      if ((Mods & SISrcMods::ABS) == 0) {
        unsigned ModsTmp;
        std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
        MI = getDefIgnoringCopies(Src, *MRI);

        if ((ModsTmp & SISrcMods::NEG) != 0)
          Mods ^= SISrcMods::NEG;

        if ((ModsTmp & SISrcMods::ABS) != 0)
          Mods |= SISrcMods::ABS;
      }
    };

    CheckAbsNeg();

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // fp16. If the source's op_sel is set, it picks the high half of the
    // source register.
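    // For example, OP_SEL_1 alone selects the low f16 half of the 32-bit
    // source for the f16->f32 conversion; adding OP_SEL_0 below switches the
    // selection to the high half when the source is an extract of the upper
    // 16 bits.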
    Mods |= SISrcMods::OP_SEL_1;

    if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
      Mods |= SISrcMods::OP_SEL_0;
      MI = ExtractHiEltMI;
      MO = &MI->getOperand(0);
      Src = MO->getReg();

      CheckAbsNeg();
    }

    Matched = true;
  }

  return {Src, Mods};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
    MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
  if (!Matched)
    return {};

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  Register Src;
  unsigned Mods;
  bool Matched;
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
  }};
}

bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register CCReg = I.getOperand(0).getReg();

  bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;

  if (HasM0) {
    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(I.getOperand(2).getReg());
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
    if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
      return false;
  } else {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
        .addImm(I.getOperand(2).getImm());
  }

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
                                      *MRI);
}

unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
  if (HasInlineConst) {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_barrier_init:
      return AMDGPU::S_BARRIER_INIT_IMM;
    case Intrinsic::amdgcn_s_barrier_join:
      return AMDGPU::S_BARRIER_JOIN_IMM;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      return AMDGPU::S_WAKEUP_BARRIER_IMM;
    case Intrinsic::amdgcn_s_get_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_IMM;
    }
  } else {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_barrier_init:
      return AMDGPU::S_BARRIER_INIT_M0;
    case Intrinsic::amdgcn_s_barrier_join:
      return AMDGPU::S_BARRIER_JOIN_M0;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      return AMDGPU::S_WAKEUP_BARRIER_M0;
    case Intrinsic::amdgcn_s_get_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_M0;
    }
  }
}

bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
                             ? I.getOperand(2)
                             : I.getOperand(1);
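  // Illustrative M0 layout built below for the non-inline S_BARRIER_INIT
  // case: a member count of 8 and a barrier id of 3 end up as
  // M0 = (8 << 16) | 3.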
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
  Register M0Val;
  Register TmpReg0;

  // For S_BARRIER_INIT, the member count will always be read from M0[16:22].
  if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
    Register MemberCount = I.getOperand(2).getReg();
    TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // TODO: This should be expanded during legalization so that the S_LSHL
    // and S_OR can be constant-folded.
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
        .addImm(16)
        .addReg(MemberCount);
    M0Val = TmpReg0;
  }

  // If not inlinable, get a reference to the barrier depending on the
  // instruction.
  if (!BarValImm) {
    if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
      // If the reference to the barrier id is not an inlinable constant then
      // it must be referenced with M0[4:0]. Perform an OR with the member
      // count to include it in M0 for S_BARRIER_INIT.
      Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
          .addReg(BarOp.getReg())
          .addReg(TmpReg0);
      M0Val = TmpReg1;
    } else {
      M0Val = BarOp.getReg();
    }
  }

  // Build a copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
  if (M0Val) {
    auto CopyMIB =
        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  MachineInstrBuilder MIB;
  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
    MIB.addDef(I.getOperand(0).getReg());

  if (BarValImm)
    MIB.addImm(*BarValImm);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register CCReg = I.getOperand(0).getReg();

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
                                      *MRI);
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx == -1);

  const MachineOperand &Op = MI.getOperand(1);
  if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
    MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
  else {
    assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
    MIB.addImm(Op.getCImm()->getSExtValue());
  }
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
}

/// This only really exists to satisfy DAG type checking machinery, so it is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm());
}

void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                       : AMDGPU::CPol::ALL_pregfx12));
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                 : AMDGPU::CPol::SWZ_pregfx12);
  MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                  : AMDGPU::CPol::ALL_pregfx12);
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
}

void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  int ExpVal = APF.getExactLog2Abs();
  assert(ExpVal != INT_MIN);
  MIB.addImm(ExpVal);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}