//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF,
                                        GISelValueTracking *VT,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, VT, CoverageInfo, PSI, BFI);
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();
  I.setDesc(TII.get(TargetOpcode::COPY));

  const MachineOperand &Src = I.getOperand(1);
  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  Register SrcReg = Src.getReg();

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {
      const TargetRegisterClass *RC
        = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
      if (!RC)
        return true;
      return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
    }

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
        return false;

      const TargetRegisterClass *SrcRC
        = TRI.getConstrainedRegClassForOperand(Src, *MRI);

      std::optional<ValueAndVReg> ConstVal =
          getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
      if (ConstVal) {
        unsigned MovOpc =
            STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
        BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
            .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
      } else {
        Register MaskedReg = MRI->createVirtualRegister(SrcRC);

        // We can't trust the high bits at this point, so clear them.

        // TODO: Skip masking high bits if def is known boolean.

        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
          assert(Subtarget->useRealTrue16Insts());
          const int64_t NoMods = 0;
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
              .addImm(NoMods)
              .addImm(1)
              .addImm(NoMods)
              .addReg(SrcReg)
              .addImm(NoMods);
          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
              .addImm(NoMods)
              .addImm(0)
              .addImm(NoMods)
              .addReg(MaskedReg)
              .addImm(NoMods);
        } else {
          bool IsSGPR = TRI.isSGPRClass(SrcRC);
          unsigned AndOpc = IsSGPR ?
              AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
              .addImm(1)
              .addReg(SrcReg);
          if (IsSGPR)
            And.setOperandDead(3); // Dead scc

          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
            .addImm(0)
            .addReg(MaskedReg);
        }
      }

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
      I.eraseFromParent();
      return true;
    }

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
      return false;

    return true;
  }

  for (const MachineOperand &MO : I.operands()) {
    if (MO.getReg().isPhysical())
      continue;

    const TargetRegisterClass *RC =
        TRI.getConstrainedRegClassForOperand(MO, *MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  unsigned CmpOpc =
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
  MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
                          .addReg(I.getOperand(1).getReg())
                          .addImm(0);
  if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
    return false;

  Register DstReg = I.getOperand(0).getReg();
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);

  I.eraseFromParent();
  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  std::optional<ValueAndVReg> Arg =
      getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);

  if (Arg) {
    const int64_t Value = Arg->Value.getZExtValue();
    if (Value == 0) {
      unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
    } else {
      assert(Value == 1);
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
    }
    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
  }

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);

  unsigned SelectOpcode =
      STI.isWave64() ?
          AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                             .addReg(TRI.getExec())
                             .addImm(0);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock *BB = I.getParent();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
                 .addReg(SrcReg);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
  if (DefTy == LLT::scalar(1))
    return false;

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI->getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();

    const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
    if (RB) {
      const LLT SrcTy = MRI->getType(SrcReg);
      const TargetRegisterClass *SrcRC =
          TRI.getRegClassForTypeOnBank(SrcTy, *RB);
      if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
        return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}

MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    Register Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
  switch (Opc) {
  case AMDGPU::G_AND:
    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
  case AMDGPU::G_OR:
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
  case AMDGPU::G_XOR:
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
  default:
    llvm_unreachable("not a bit op");
  }
}

bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)
    return false;

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
                            STI.isWave64());
  I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));

  // Dead implicit-def of scc
  I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
                                         true, // isImp
                                         false, // isKill
                                         true)); // isDead
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  LLT Ty = MRI->getType(DstReg);
  if (Ty.isVector())
    return false;

  unsigned Size = Ty.getSizeInBits();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ?
          AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2))
        .setOperandDead(3); // Dead scc
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;

    Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI->createVirtualRegister(&HalfRC);
  Register DstHi = MRI->createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2)
      .setOperandDead(3); // Dead scc
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI->createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register Dst0Reg = I.getOperand(0).getReg();
  Register Dst1Reg = I.getOperand(1).getReg();
  const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
                     I.getOpcode() == AMDGPU::G_UADDE;
  const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
                          I.getOpcode() == AMDGPU::G_USUBE;

  if (isVCC(Dst1Reg, *MRI)) {
    unsigned NoCarryOpc =
        IsAdd ?
            AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    I.addOperand(*MF, MachineOperand::CreateImm(0));
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  Register Src0Reg = I.getOperand(2).getReg();
  Register Src1Reg = I.getOperand(3).getReg();

  if (HasCarryIn) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(I.getOperand(4).getReg());
  }

  unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
  unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
    .add(I.getOperand(2))
    .add(I.getOperand(3));

  if (MRI->use_nodbg_empty(Dst1Reg)) {
    CarryInst.setOperandDead(3); // Dead scc
  } else {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
      .addReg(AMDGPU::SCC);
    if (!MRI->getRegClassOrNull(Dst1Reg))
      MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
  }

  if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  if (HasCarryIn &&
      !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
                                    AMDGPU::SReg_32RegClass, *MRI))
    return false;

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
    MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;

  unsigned Opc;
  if (Subtarget->hasMADIntraFwdBug())
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
                     : AMDGPU::V_MAD_I64_I32_gfx11_e64;
  else
    Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
  I.setDesc(TII.get(Opc));
  I.addOperand(*MF, MachineOperand::CreateImm(0));
  I.addImplicitDefUseOperands(*MF);
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}

// TODO: We should probably legalize these to only using 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  unsigned DstSize = DstTy.getSizeInBits();

  // TODO: Should handle any multiple of 32 offset.
  unsigned Offset = I.getOperand(2).getImm();
  if (Offset % 32 != 0 || DstSize > 128)
    return false;

  // 16-bit operations really use 32-bit registers.
  // FIXME: Probably should not allow 16-bit G_EXTRACT results.
  if (DstSize == 16)
    DstSize = 32;

  const TargetRegisterClass *DstRC =
    TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC)
    return false;
  unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
                                                         DstSize / 32);
  SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
  if (!SrcRC)
    return false;

  SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
                                    *SrcRC, I.getOperand(1));
  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
    .addReg(SrcReg, 0, SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg);
  LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  if (SrcSize < 32)
    return selectImpl(MI, *CoverageInfo);

  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, *MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI->getType(DstReg0);
  LLT SrcTy = MRI->getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
    return false;

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, 0, SubRegs[I]);

    // Make sure the subregister index is valid for the source register.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
    if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
      return false;

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, *MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
  assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
         MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  LLT SrcTy = MRI->getType(Src0);
  const unsigned SrcSize = SrcTy.getSizeInBits();

  // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
    return selectG_MERGE_VALUES(MI);
  }

  // Selection logic below is for V2S16 only.
  // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
  Register Dst = MI.getOperand(0).getReg();
  if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
      (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
       SrcTy != LLT::scalar(32)))
    return selectImpl(MI, *CoverageInfo);

  const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstBank->getID() == AMDGPU::AGPRRegBankID)
    return false;

  assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
         DstBank->getID() == AMDGPU::VGPRRegBankID);
  const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *BB = MI.getParent();

  // First, before trying TableGen patterns, check if both sources are
  // constants. In those cases, we can trivially compute the final constant
  // and emit a simple move.
  auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
  if (ConstSrc1) {
    auto ConstSrc0 =
        getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
    if (ConstSrc0) {
      const int64_t K0 = ConstSrc0->Value.getSExtValue();
      const int64_t K1 = ConstSrc1->Value.getSExtValue();
      uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
      uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
      uint32_t Imm = Lo16 | (Hi16 << 16);

      // VALU
      if (IsVector) {
        BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
        MI.eraseFromParent();
        return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
      }

      // SALU
      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
      MI.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  // Now try TableGen patterns.
  if (selectImpl(MI, *CoverageInfo))
    return true;

  // TODO: This should probably be a combine somewhere
  // (build_vector $src0, undef)  -> copy $src0
  MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
  if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
    MI.setDesc(TII.get(AMDGPU::COPY));
    MI.removeOperand(2);
    const auto &RC =
        IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
    return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
           RBI.constrainGenericRegister(Src0, RC, *MRI);
  }

  // TODO: Can be improved?
  if (IsVector) {
    Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
                   .addImm(0xFFFF)
                   .addReg(Src0);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
              .addReg(Src1)
              .addImm(16)
              .addReg(TmpReg);
    if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
      return false;

    MI.eraseFromParent();
    return true;
  }

  Register ShiftSrc0;
  Register ShiftSrc1;

  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)

  bool Shift0 = mi_match(
      Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));

  bool Shift1 = mi_match(
      Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));

  unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
  if (Shift0 && Shift1) {
    Opc = AMDGPU::S_PACK_HH_B32_B16;
    MI.getOperand(1).setReg(ShiftSrc0);
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift1) {
    Opc = AMDGPU::S_PACK_LH_B32_B16;
    MI.getOperand(2).setReg(ShiftSrc1);
  } else if (Shift0) {
    auto ConstSrc1 =
        getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
    if (ConstSrc1 && ConstSrc1->Value == 0) {
      // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
      auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
                     .addReg(ShiftSrc0)
                     .addImm(16)
                     .setOperandDead(3); // Dead scc

      MI.eraseFromParent();
      return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
    }
    if (STI.hasSPackHL()) {
      Opc = AMDGPU::S_PACK_HL_B32_B16;
      MI.getOperand(1).setReg(ShiftSrc0);
    }
  }

  MI.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
  if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();

  Register DstReg = I.getOperand(0).getReg();
  Register Src0Reg = I.getOperand(1).getReg();
  Register Src1Reg = I.getOperand(2).getReg();
  LLT Src1Ty = MRI->getType(Src1Reg);

  unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
  unsigned InsSize = Src1Ty.getSizeInBits();

  int64_t Offset = I.getOperand(3).getImm();

  // FIXME: These cases should have been illegal and unnecessary to check here.
  if (Offset % 32 != 0 || InsSize % 32 != 0)
    return false;

  // Currently not handled by getSubRegFromChannel.
  if (InsSize > 128)
    return false;

  unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
  if (SubReg == AMDGPU::NoSubRegister)
    return false;

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
  if (!DstRC)
    return false;

  const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
  const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
  const TargetRegisterClass *Src0RC =
      TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
  const TargetRegisterClass *Src1RC =
      TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);

  // Deal with weird cases where the class only partially supports the subreg
  // index.
  Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
  if (!Src0RC || !Src1RC)
    return false;

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
      !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
    return false;

  const DebugLoc &DL = I.getDebugLoc();
  BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
    .addReg(Src0Reg)
    .addReg(Src1Reg)
    .addImm(SubReg);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  Register OffsetReg = MI.getOperand(2).getReg();
  Register WidthReg = MI.getOperand(3).getReg();

  assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
         "scalar BFX instructions are expanded in regbankselect");
  assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
         "64-bit vector BFX instructions are expanded in regbankselect");

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
  unsigned Opc = IsSigned ?
      AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
                 .addReg(SrcReg)
                 .addReg(OffsetReg)
                 .addReg(WidthReg);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
  if (STI.getLDSBankCount() != 16)
    return selectImpl(MI, *CoverageInfo);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();
  Register M0Val = MI.getOperand(6).getReg();
  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
    return false;

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.

  Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .addReg(M0Val);
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
    .addImm(2)
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm()); // $attrchan

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
    .addImm(0)                          // $src0_modifiers
    .addReg(Src0)                       // $src0
    .addImm(MI.getOperand(4).getImm())  // $attr
    .addImm(MI.getOperand(3).getImm())  // $attrchan
    .addImm(0)                          // $src2_modifiers
    .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
    .addImm(MI.getOperand(5).getImm())  // $high
    .addImm(0)                          // $clamp
    .addImm(0);                         // $omod

  MI.eraseFromParent();
  return true;
}

// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
  // With a constant bus limit of at least 2, there's no issue.
  if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
    return selectImpl(MI, *CoverageInfo);

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register VDst = MI.getOperand(0).getReg();
  Register Val = MI.getOperand(2).getReg();
  Register LaneSelect = MI.getOperand(3).getReg();
  Register VDstIn = MI.getOperand(4).getReg();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);

  std::optional<ValueAndVReg> ConstSelect =
      getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
  if (ConstSelect) {
    // The selector has to be an inline immediate, so we can use whatever for
    // the other operands.
    MIB.addReg(Val);
    MIB.addImm(ConstSelect->Value.getSExtValue() &
               maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
  } else {
    std::optional<ValueAndVReg> ConstVal =
        getIConstantVRegValWithLookThrough(Val, *MRI);

    // If the value written is an inline immediate, we can get away without a
    // copy to m0.
    if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
                                                 STI.hasInv2PiInlineImm())) {
      MIB.addImm(ConstVal->Value.getSExtValue());
      MIB.addReg(LaneSelect);
    } else {
      MIB.addReg(Val);

      // If the lane selector was originally in a VGPR and copied with
      // readfirstlane, there's a hazard to read the same SGPR from the
      // VALU. Constrain to a different SGPR to help avoid needing a nop later.
      RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);

      BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(LaneSelect);
      MIB.addReg(AMDGPU::M0);
    }
  }

  MIB.addReg(VDstIn);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  LLT Ty = MRI->getType(Dst0);
  unsigned Opc;
  if (Ty == LLT::scalar(32))
    Opc = AMDGPU::V_DIV_SCALE_F32_e64;
  else if (Ty == LLT::scalar(64))
    Opc = AMDGPU::V_DIV_SCALE_F64_e64;
  else
    return false;

  // TODO: Match source modifiers.

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Numer = MI.getOperand(3).getReg();
  Register Denom = MI.getOperand(4).getReg();
  unsigned ChooseDenom = MI.getOperand(5).getImm();

  Register Src0 = ChooseDenom != 0 ? Numer : Denom;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
    .addDef(Dst1)
    .addImm(0)     // $src0_modifiers
    .addUse(Src0)  // $src0
    .addImm(0)     // $src1_modifiers
    .addUse(Denom) // $src1
    .addImm(0)     // $src2_modifiers
    .addUse(Numer) // $src2
    .addImm(0)     // $clamp
    .addImm(0);    // $omod

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_if_break: {
    MachineBasicBlock *BB = I.getParent();

    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
    // SelectionDAG uses for wave32 vs wave64.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
      .add(I.getOperand(0))
      .add(I.getOperand(2))
      .add(I.getOperand(3));

    Register DstReg = I.getOperand(0).getReg();
    Register Src0Reg = I.getOperand(2).getReg();
    Register Src1Reg = I.getOperand(3).getReg();

    I.eraseFromParent();

    for (Register Reg : { DstReg, Src0Reg, Src1Reg })
      MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());

    return true;
  }
  case Intrinsic::amdgcn_interp_p1_f16:
    return selectInterpP1F16(I);
  case Intrinsic::amdgcn_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::WQM);
  case Intrinsic::amdgcn_softwqm:
    return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
  case Intrinsic::amdgcn_strict_wwm:
  case Intrinsic::amdgcn_wwm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
  case Intrinsic::amdgcn_strict_wqm:
    return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
  case Intrinsic::amdgcn_writelane:
    return selectWritelane(I);
  case Intrinsic::amdgcn_div_scale:
    return selectDivScale(I);
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp:
    if (selectImpl(I, *CoverageInfo))
      return true;
    return selectIntrinsicCmp(I);
  case Intrinsic::amdgcn_ballot:
    return selectBallot(I);
  case Intrinsic::amdgcn_reloc_constant:
    return selectRelocConstant(I);
  case Intrinsic::amdgcn_groupstaticsize:
    return selectGroupStaticSize(I);
  case Intrinsic::returnaddress:
    return selectReturnAddress(I);
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    return selectSMFMACIntrin(I);
  case Intrinsic::amdgcn_permlane16_swap:
  case Intrinsic::amdgcn_permlane32_swap:
    return selectPermlaneSwapIntrin(I, IntrinsicID);
  default:
    return selectImpl(I, *CoverageInfo);
  }
}

static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
                          const GCNSubtarget &ST) {
  if (Size != 16 && Size != 32 && Size != 64)
    return -1;

  if (Size == 16 && !ST.has16BitInsts())
    return -1;

  const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc,
                          unsigned FakeS16Opc, unsigned S32Opc,
                          unsigned S64Opc) {
    if (Size == 16)
      return ST.hasTrue16BitInsts()
                 ? ST.useRealTrue16Insts() ? TrueS16Opc : FakeS16Opc
                 : S16Opc;
    if (Size == 32)
      return S32Opc;
    return S64Opc;
  };

  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
                  AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
                  AMDGPU::V_CMP_NE_U64_e64);
  case CmpInst::ICMP_EQ:
    return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
                  AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
                  AMDGPU::V_CMP_EQ_U64_e64);
  case CmpInst::ICMP_SGT:
    return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
                  AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
                  AMDGPU::V_CMP_GT_I64_e64);
  case CmpInst::ICMP_SGE:
    return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
                  AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
                  AMDGPU::V_CMP_GE_I64_e64);
  case CmpInst::ICMP_SLT:
    return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
                  AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
                  AMDGPU::V_CMP_LT_I64_e64);
  case CmpInst::ICMP_SLE:
    return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
                  AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
                  AMDGPU::V_CMP_LE_I64_e64);
  case CmpInst::ICMP_UGT:
    return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
                  AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
                  AMDGPU::V_CMP_GT_U64_e64);
  case CmpInst::ICMP_UGE:
    return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
                  AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
                  AMDGPU::V_CMP_GE_U64_e64);
  case CmpInst::ICMP_ULT:
    return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
                  AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
                  AMDGPU::V_CMP_LT_U64_e64);
  case CmpInst::ICMP_ULE:
    return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
                  AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
                  AMDGPU::V_CMP_LE_U64_e64);

  case CmpInst::FCMP_OEQ:
    return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
                  AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
                  AMDGPU::V_CMP_EQ_F64_e64);
  case CmpInst::FCMP_OGT:
    return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
                  AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
                  AMDGPU::V_CMP_GT_F64_e64);
  case CmpInst::FCMP_OGE:
    return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
                  AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
                  AMDGPU::V_CMP_GE_F64_e64);
  case CmpInst::FCMP_OLT:
    return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
                  AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
                  AMDGPU::V_CMP_LT_F64_e64);
  case CmpInst::FCMP_OLE:
    return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
                  AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
                  AMDGPU::V_CMP_LE_F64_e64);
  case CmpInst::FCMP_ONE:
    return
        Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
               AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
               AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_ORD:
    return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
                  AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
                  AMDGPU::V_CMP_O_F64_e64);
  case CmpInst::FCMP_UNO:
    return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
                  AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
                  AMDGPU::V_CMP_U_F64_e64);
  case CmpInst::FCMP_UEQ:
    return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
                  AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
                  AMDGPU::V_CMP_NLG_F64_e64);
  case CmpInst::FCMP_UGT:
    return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
                  AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
                  AMDGPU::V_CMP_NLE_F64_e64);
  case CmpInst::FCMP_UGE:
    return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
                  AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
                  AMDGPU::V_CMP_NLT_F64_e64);
  case CmpInst::FCMP_ULT:
    return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
                  AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
                  AMDGPU::V_CMP_NGE_F64_e64);
  case CmpInst::FCMP_ULE:
    return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
                  AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
                  AMDGPU::V_CMP_NGT_F64_e64);
  case CmpInst::FCMP_UNE:
    return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
                  AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
                  AMDGPU::V_CMP_NEQ_F64_e64);
  case CmpInst::FCMP_TRUE:
    return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
                  AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
                  AMDGPU::V_CMP_TRU_F64_e64);
  case CmpInst::FCMP_FALSE:
    return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
                  AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
                  AMDGPU::V_CMP_F_F64_e64);
  }
}

int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size == 32) {
    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U32;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U32;
    case CmpInst::ICMP_SGT:
      return AMDGPU::S_CMP_GT_I32;
    case CmpInst::ICMP_SGE:
      return AMDGPU::S_CMP_GE_I32;
    case CmpInst::ICMP_SLT:
      return AMDGPU::S_CMP_LT_I32;
    case CmpInst::ICMP_SLE:
      return AMDGPU::S_CMP_LE_I32;
    case CmpInst::ICMP_UGT:
      return AMDGPU::S_CMP_GT_U32;
    case CmpInst::ICMP_UGE:
      return AMDGPU::S_CMP_GE_U32;
    case CmpInst::ICMP_ULT:
      return AMDGPU::S_CMP_LT_U32;
    case CmpInst::ICMP_ULE:
      return AMDGPU::S_CMP_LE_U32;
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F32;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F32;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F32;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F32;
    case CmpInst::FCMP_OLE:
      return
          AMDGPU::S_CMP_LE_F32;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F32;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F32;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F32;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F32;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F32;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F32;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F32;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F32;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F32;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  if (Size == 16) {
    if (!STI.hasSALUFloatInsts())
      return -1;

    switch (P) {
    case CmpInst::FCMP_OEQ:
      return AMDGPU::S_CMP_EQ_F16;
    case CmpInst::FCMP_OGT:
      return AMDGPU::S_CMP_GT_F16;
    case CmpInst::FCMP_OGE:
      return AMDGPU::S_CMP_GE_F16;
    case CmpInst::FCMP_OLT:
      return AMDGPU::S_CMP_LT_F16;
    case CmpInst::FCMP_OLE:
      return AMDGPU::S_CMP_LE_F16;
    case CmpInst::FCMP_ONE:
      return AMDGPU::S_CMP_LG_F16;
    case CmpInst::FCMP_ORD:
      return AMDGPU::S_CMP_O_F16;
    case CmpInst::FCMP_UNO:
      return AMDGPU::S_CMP_U_F16;
    case CmpInst::FCMP_UEQ:
      return AMDGPU::S_CMP_NLG_F16;
    case CmpInst::FCMP_UGT:
      return AMDGPU::S_CMP_NLE_F16;
    case CmpInst::FCMP_UGE:
      return AMDGPU::S_CMP_NLT_F16;
    case CmpInst::FCMP_ULT:
      return AMDGPU::S_CMP_NGE_F16;
    case CmpInst::FCMP_ULE:
      return AMDGPU::S_CMP_NGT_F16;
    case CmpInst::FCMP_UNE:
      return AMDGPU::S_CMP_NEQ_F16;
    default:
      llvm_unreachable("Unknown condition code!");
    }
  }

  return -1;
}

bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  Register CCReg = I.getOperand(0).getReg();
  if (!isVCC(CCReg, *MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
    I.eraseFromParent();
    return Ret;
  }

  if (I.getOpcode() == AMDGPU::G_FCMP)
    return false;

  int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder ICmp;
  // t16 instructions
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .addImm(0)
               .add(I.getOperand(2))
               .addImm(0)
               .add(I.getOperand(3))
               .addImm(0); // op_sel
  } else {
    ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), I.getOperand(0).getReg())
               .add(I.getOperand(2))
               .add(I.getOperand(3));
  }

  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               *TRI.getBoolRC(), *MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
  Register Dst = I.getOperand(0).getReg();
  if (isVCC(Dst, *MRI))
    return false;

  LLT DstTy = MRI->getType(Dst);
  if (DstTy.getSizeInBits() != STI.getWavefrontSize())
    return false;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);

  // i1 inputs are not supported in GlobalISel.
  if (Size == 1)
    return false;

  auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
  if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
    I.eraseFromParent();
    return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  }

  const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
  if (Opcode == -1)
    return false;

  MachineInstrBuilder SelectedMI;
  MachineOperand &LHS = I.getOperand(2);
  MachineOperand &RHS = I.getOperand(3);
  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
  Register Src0Reg =
      copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
  Register Src1Reg =
      copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
  SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
    SelectedMI.addImm(Src0Mods);
  SelectedMI.addReg(Src0Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
    SelectedMI.addImm(Src1Mods);
  SelectedMI.addReg(Src1Reg);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
    SelectedMI.addImm(0); // clamp
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
    SelectedMI.addImm(0); // op_sel

  RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
  if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
    return false;

  I.eraseFromParent();
  return true;
}

// Ballot has to zero bits in the input lane-mask that are zero in the current
// exec; this is done as an AND with exec. For inputs that are results of
// instructions that implicitly use the same exec (for example compares in the
// same basic block, or an SCC to VCC copy), a plain copy is enough.
static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI,
                                    MachineBasicBlock *MBB) {
  MachineInstr *MI = MRI.getVRegDef(Reg);
  if (MI->getParent() != MBB)
    return false;

  // Lane mask generated by SCC to VCC copy.
  if (MI->getOpcode() == AMDGPU::COPY) {
    auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg());
    auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg());
    if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
        SrcRB->getID() == AMDGPU::SGPRRegBankID)
      return true;
  }

  // Lane mask generated using compare with same exec.
  if (isa<GAnyCmp>(MI))
    return true;

  Register LHS, RHS;
  // Look through AND.
1581 if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS)))) 1582 return isLaneMaskFromSameBlock(LHS, MRI, MBB) || 1583 isLaneMaskFromSameBlock(RHS, MRI, MBB); 1584 1585 return false; 1586 } 1587 1588 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { 1589 MachineBasicBlock *BB = I.getParent(); 1590 const DebugLoc &DL = I.getDebugLoc(); 1591 Register DstReg = I.getOperand(0).getReg(); 1592 Register SrcReg = I.getOperand(2).getReg(); 1593 const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits(); 1594 const unsigned WaveSize = STI.getWavefrontSize(); 1595 1596 // In the common case, the return type matches the wave size. 1597 // However we also support emitting i64 ballots in wave32 mode. 1598 if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32)) 1599 return false; 1600 1601 std::optional<ValueAndVReg> Arg = 1602 getIConstantVRegValWithLookThrough(SrcReg, *MRI); 1603 1604 Register Dst = DstReg; 1605 // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. 1606 if (BallotSize != WaveSize) { 1607 Dst = MRI->createVirtualRegister(TRI.getBoolRC()); 1608 } 1609 1610 if (Arg) { 1611 const int64_t Value = Arg->Value.getZExtValue(); 1612 if (Value == 0) { 1613 // Dst = S_MOV 0 1614 unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 1615 BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0); 1616 } else { 1617 // Dst = COPY EXEC 1618 assert(Value == 1); 1619 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec()); 1620 } 1621 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) 1622 return false; 1623 } else { 1624 if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) { 1625 // Dst = COPY SrcReg 1626 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg); 1627 if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) 1628 return false; 1629 } else { 1630 // Dst = S_AND SrcReg, EXEC 1631 unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 1632 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst) 1633 .addReg(SrcReg) 1634 .addReg(TRI.getExec()) 1635 .setOperandDead(3); // Dead scc 1636 if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) 1637 return false; 1638 } 1639 } 1640 1641 // i64 ballot on Wave32: zero-extend i32 ballot to i64. 
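  // For the i64-ballot-on-wave32 case the ballot itself only produces 32 bits;
  // the upper half of the result is always zero, so it is materialized
  // separately and combined with a REG_SEQUENCE below.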
1642 if (BallotSize != WaveSize) { 1643 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1644 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); 1645 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 1646 .addReg(Dst) 1647 .addImm(AMDGPU::sub0) 1648 .addReg(HiReg) 1649 .addImm(AMDGPU::sub1); 1650 } 1651 1652 I.eraseFromParent(); 1653 return true; 1654 } 1655 1656 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const { 1657 Register DstReg = I.getOperand(0).getReg(); 1658 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 1659 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank); 1660 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) 1661 return false; 1662 1663 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID; 1664 1665 Module *M = MF->getFunction().getParent(); 1666 const MDNode *Metadata = I.getOperand(2).getMetadata(); 1667 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); 1668 auto *RelocSymbol = cast<GlobalVariable>( 1669 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); 1670 1671 MachineBasicBlock *BB = I.getParent(); 1672 BuildMI(*BB, &I, I.getDebugLoc(), 1673 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg) 1674 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO); 1675 1676 I.eraseFromParent(); 1677 return true; 1678 } 1679 1680 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { 1681 Triple::OSType OS = MF->getTarget().getTargetTriple().getOS(); 1682 1683 Register DstReg = I.getOperand(0).getReg(); 1684 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 1685 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ? 
1686 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1687 1688 MachineBasicBlock *MBB = I.getParent(); 1689 const DebugLoc &DL = I.getDebugLoc(); 1690 1691 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg); 1692 1693 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) { 1694 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1695 MIB.addImm(MFI->getLDSSize()); 1696 } else { 1697 Module *M = MF->getFunction().getParent(); 1698 const GlobalValue *GV = 1699 Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize); 1700 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 1701 } 1702 1703 I.eraseFromParent(); 1704 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1705 } 1706 1707 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { 1708 MachineBasicBlock *MBB = I.getParent(); 1709 MachineFunction &MF = *MBB->getParent(); 1710 const DebugLoc &DL = I.getDebugLoc(); 1711 1712 MachineOperand &Dst = I.getOperand(0); 1713 Register DstReg = Dst.getReg(); 1714 unsigned Depth = I.getOperand(2).getImm(); 1715 1716 const TargetRegisterClass *RC 1717 = TRI.getConstrainedRegClassForOperand(Dst, *MRI); 1718 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) || 1719 !RBI.constrainGenericRegister(DstReg, *RC, *MRI)) 1720 return false; 1721 1722 // Check for kernel and shader functions 1723 if (Depth != 0 || 1724 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) { 1725 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg) 1726 .addImm(0); 1727 I.eraseFromParent(); 1728 return true; 1729 } 1730 1731 MachineFrameInfo &MFI = MF.getFrameInfo(); 1732 // There is a call to @llvm.returnaddress in this function 1733 MFI.setReturnAddressIsTaken(true); 1734 1735 // Get the return address reg and mark it as an implicit live-in 1736 Register ReturnAddrReg = TRI.getReturnAddressReg(MF); 1737 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg, 1738 AMDGPU::SReg_64RegClass, DL); 1739 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg) 1740 .addReg(LiveIn); 1741 I.eraseFromParent(); 1742 return true; 1743 } 1744 1745 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { 1746 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1747 // SelectionDAG uses for wave32 vs wave64. 
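  // SI_END_CF consumes the saved exec mask produced by the matching
  // SI_IF/SI_ELSE and restores exec from it; all that is needed here is to
  // emit the pseudo and make sure the mask operand has a wave-mask class.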
1748 MachineBasicBlock *BB = MI.getParent(); 1749 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) 1750 .add(MI.getOperand(1)); 1751 1752 Register Reg = MI.getOperand(1).getReg(); 1753 MI.eraseFromParent(); 1754 1755 if (!MRI->getRegClassOrNull(Reg)) 1756 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass()); 1757 return true; 1758 } 1759 1760 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic( 1761 MachineInstr &MI, Intrinsic::ID IntrID) const { 1762 MachineBasicBlock *MBB = MI.getParent(); 1763 MachineFunction *MF = MBB->getParent(); 1764 const DebugLoc &DL = MI.getDebugLoc(); 1765 1766 unsigned IndexOperand = MI.getOperand(7).getImm(); 1767 bool WaveRelease = MI.getOperand(8).getImm() != 0; 1768 bool WaveDone = MI.getOperand(9).getImm() != 0; 1769 1770 if (WaveDone && !WaveRelease) { 1771 // TODO: Move this to IR verifier 1772 const Function &Fn = MF->getFunction(); 1773 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 1774 Fn, "ds_ordered_count: wave_done requires wave_release", DL)); 1775 } 1776 1777 unsigned OrderedCountIndex = IndexOperand & 0x3f; 1778 IndexOperand &= ~0x3f; 1779 unsigned CountDw = 0; 1780 1781 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) { 1782 CountDw = (IndexOperand >> 24) & 0xf; 1783 IndexOperand &= ~(0xf << 24); 1784 1785 if (CountDw < 1 || CountDw > 4) { 1786 const Function &Fn = MF->getFunction(); 1787 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 1788 Fn, "ds_ordered_count: dword count must be between 1 and 4", DL)); 1789 CountDw = 1; 1790 } 1791 } 1792 1793 if (IndexOperand) { 1794 const Function &Fn = MF->getFunction(); 1795 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 1796 Fn, "ds_ordered_count: bad index operand", DL)); 1797 } 1798 1799 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; 1800 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF); 1801 1802 unsigned Offset0 = OrderedCountIndex << 2; 1803 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4); 1804 1805 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) 1806 Offset1 |= (CountDw - 1) << 6; 1807 1808 if (STI.getGeneration() < AMDGPUSubtarget::GFX11) 1809 Offset1 |= ShaderType << 2; 1810 1811 unsigned Offset = Offset0 | (Offset1 << 8); 1812 1813 Register M0Val = MI.getOperand(2).getReg(); 1814 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1815 .addReg(M0Val); 1816 1817 Register DstReg = MI.getOperand(0).getReg(); 1818 Register ValReg = MI.getOperand(3).getReg(); 1819 MachineInstrBuilder DS = 1820 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg) 1821 .addReg(ValReg) 1822 .addImm(Offset) 1823 .cloneMemRefs(MI); 1824 1825 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI)) 1826 return false; 1827 1828 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI); 1829 MI.eraseFromParent(); 1830 return Ret; 1831 } 1832 1833 static unsigned gwsIntrinToOpcode(unsigned IntrID) { 1834 switch (IntrID) { 1835 case Intrinsic::amdgcn_ds_gws_init: 1836 return AMDGPU::DS_GWS_INIT; 1837 case Intrinsic::amdgcn_ds_gws_barrier: 1838 return AMDGPU::DS_GWS_BARRIER; 1839 case Intrinsic::amdgcn_ds_gws_sema_v: 1840 return AMDGPU::DS_GWS_SEMA_V; 1841 case Intrinsic::amdgcn_ds_gws_sema_br: 1842 return AMDGPU::DS_GWS_SEMA_BR; 1843 case Intrinsic::amdgcn_ds_gws_sema_p: 1844 return AMDGPU::DS_GWS_SEMA_P; 1845 case Intrinsic::amdgcn_ds_gws_sema_release_all: 1846 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; 1847 default: 1848 llvm_unreachable("not a gws intrinsic"); 1849 } 1850 } 1851 1852 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, 1853 Intrinsic::ID IID) const { 1854 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all && 1855 !STI.hasGWSSemaReleaseAll())) 1856 return false; 1857 1858 // intrinsic ID, vsrc, offset 1859 const bool HasVSrc = MI.getNumOperands() == 3; 1860 assert(HasVSrc || MI.getNumOperands() == 2); 1861 1862 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg(); 1863 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI); 1864 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID) 1865 return false; 1866 1867 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1868 unsigned ImmOffset; 1869 1870 MachineBasicBlock *MBB = MI.getParent(); 1871 const DebugLoc &DL = MI.getDebugLoc(); 1872 1873 MachineInstr *Readfirstlane = nullptr; 1874 1875 // If we legalized the VGPR input, strip out the readfirstlane to analyze the 1876 // incoming offset, in case there's an add of a constant. We'll have to put it 1877 // back later. 1878 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) { 1879 Readfirstlane = OffsetDef; 1880 BaseOffset = OffsetDef->getOperand(1).getReg(); 1881 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI); 1882 } 1883 1884 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) { 1885 // If we have a constant offset, try to use the 0 in m0 as the base. 1886 // TODO: Look into changing the default m0 initialization value. If the 1887 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1888 // the immediate offset. 
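    // With m0 set to 0 the m0[21:16] contribution to the resource id drops
    // out, so the constant can be carried entirely in the instruction's
    // offset field (the non-constant path below instead shifts the variable
    // base into m0[21:16]).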
1889 1890 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue(); 1891 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 1892 .addImm(0); 1893 } else { 1894 std::tie(BaseOffset, ImmOffset) = 1895 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, VT); 1896 1897 if (Readfirstlane) { 1898 // We have the constant offset now, so put the readfirstlane back on the 1899 // variable component. 1900 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI)) 1901 return false; 1902 1903 Readfirstlane->getOperand(1).setReg(BaseOffset); 1904 BaseOffset = Readfirstlane->getOperand(0).getReg(); 1905 } else { 1906 if (!RBI.constrainGenericRegister(BaseOffset, 1907 AMDGPU::SReg_32RegClass, *MRI)) 1908 return false; 1909 } 1910 1911 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1912 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base) 1913 .addReg(BaseOffset) 1914 .addImm(16) 1915 .setOperandDead(3); // Dead scc 1916 1917 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1918 .addReg(M0Base); 1919 } 1920 1921 // The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1922 // offset field) % 64. Some versions of the programming guide omit the m0 1923 // part, or claim it's from offset 0. 1924 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID))); 1925 1926 if (HasVSrc) { 1927 Register VSrc = MI.getOperand(1).getReg(); 1928 MIB.addReg(VSrc); 1929 1930 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI)) 1931 return false; 1932 } 1933 1934 MIB.addImm(ImmOffset) 1935 .cloneMemRefs(MI); 1936 1937 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0); 1938 1939 MI.eraseFromParent(); 1940 return true; 1941 } 1942 1943 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, 1944 bool IsAppend) const { 1945 Register PtrBase = MI.getOperand(2).getReg(); 1946 LLT PtrTy = MRI->getType(PtrBase); 1947 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; 1948 1949 unsigned Offset; 1950 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); 1951 1952 // TODO: Should this try to look through readfirstlane like GWS? 1953 if (!isDSOffsetLegal(PtrBase, Offset)) { 1954 PtrBase = MI.getOperand(2).getReg(); 1955 Offset = 0; 1956 } 1957 1958 MachineBasicBlock *MBB = MI.getParent(); 1959 const DebugLoc &DL = MI.getDebugLoc(); 1960 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; 1961 1962 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 1963 .addReg(PtrBase); 1964 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI)) 1965 return false; 1966 1967 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) 1968 .addImm(Offset) 1969 .addImm(IsGDS ? 
-1 : 0) 1970 .cloneMemRefs(MI); 1971 MI.eraseFromParent(); 1972 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 1973 } 1974 1975 bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const { 1976 MachineFunction *MF = MI.getParent()->getParent(); 1977 SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>(); 1978 1979 MFInfo->setInitWholeWave(); 1980 return selectImpl(MI, *CoverageInfo); 1981 } 1982 1983 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const { 1984 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID(); 1985 if (TM.getOptLevel() > CodeGenOptLevel::None) { 1986 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second; 1987 if (WGSize <= STI.getWavefrontSize()) { 1988 // If the workgroup fits in a wave, remove s_barrier_signal and lower 1989 // s_barrier/s_barrier_wait to wave_barrier. 1990 if (IntrinsicID == Intrinsic::amdgcn_s_barrier || 1991 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) { 1992 MachineBasicBlock *MBB = MI.getParent(); 1993 const DebugLoc &DL = MI.getDebugLoc(); 1994 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER)); 1995 } 1996 MI.eraseFromParent(); 1997 return true; 1998 } 1999 } 2000 2001 if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) { 2002 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 2003 MachineBasicBlock *MBB = MI.getParent(); 2004 const DebugLoc &DL = MI.getDebugLoc(); 2005 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM)) 2006 .addImm(AMDGPU::Barrier::WORKGROUP); 2007 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT)) 2008 .addImm(AMDGPU::Barrier::WORKGROUP); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 return selectImpl(MI, *CoverageInfo); 2014 } 2015 2016 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, 2017 bool &IsTexFail) { 2018 if (TexFailCtrl) 2019 IsTexFail = true; 2020 2021 TFE = TexFailCtrl & 0x1; 2022 TexFailCtrl &= ~(uint64_t)0x1; 2023 LWE = TexFailCtrl & 0x2; 2024 TexFailCtrl &= ~(uint64_t)0x2; 2025 2026 return TexFailCtrl == 0; 2027 } 2028 2029 bool AMDGPUInstructionSelector::selectImageIntrinsic( 2030 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 2031 MachineBasicBlock *MBB = MI.getParent(); 2032 const DebugLoc &DL = MI.getDebugLoc(); 2033 2034 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 2035 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 2036 2037 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); 2038 unsigned IntrOpcode = Intr->BaseOpcode; 2039 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI); 2040 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI); 2041 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); 2042 2043 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1; 2044 2045 Register VDataIn, VDataOut; 2046 LLT VDataTy; 2047 int NumVDataDwords = -1; 2048 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 || 2049 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16; 2050 2051 bool Unorm; 2052 if (!BaseOpcode->Sampler) 2053 Unorm = true; 2054 else 2055 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0; 2056 2057 bool TFE; 2058 bool LWE; 2059 bool IsTexFail = false; 2060 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(), 2061 TFE, LWE, IsTexFail)) 2062 return false; 2063 2064 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm(); 2065 const bool IsA16 = (Flags & 1) != 0; 2066 const bool IsG16 = (Flags & 2) 
!= 0; 2067 2068 // A16 implies 16 bit gradients if subtarget doesn't support G16 2069 if (IsA16 && !STI.hasG16() && !IsG16) 2070 return false; 2071 2072 unsigned DMask = 0; 2073 unsigned DMaskLanes = 0; 2074 2075 if (BaseOpcode->Atomic) { 2076 VDataOut = MI.getOperand(0).getReg(); 2077 VDataIn = MI.getOperand(2).getReg(); 2078 LLT Ty = MRI->getType(VDataIn); 2079 2080 // Be careful to allow atomic swap on 16-bit element vectors. 2081 const bool Is64Bit = BaseOpcode->AtomicX2 ? 2082 Ty.getSizeInBits() == 128 : 2083 Ty.getSizeInBits() == 64; 2084 2085 if (BaseOpcode->AtomicX2) { 2086 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister); 2087 2088 DMask = Is64Bit ? 0xf : 0x3; 2089 NumVDataDwords = Is64Bit ? 4 : 2; 2090 } else { 2091 DMask = Is64Bit ? 0x3 : 0x1; 2092 NumVDataDwords = Is64Bit ? 2 : 1; 2093 } 2094 } else { 2095 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 2096 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); 2097 2098 if (BaseOpcode->Store) { 2099 VDataIn = MI.getOperand(1).getReg(); 2100 VDataTy = MRI->getType(VDataIn); 2101 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32; 2102 } else if (BaseOpcode->NoReturn) { 2103 NumVDataDwords = 0; 2104 } else { 2105 VDataOut = MI.getOperand(0).getReg(); 2106 VDataTy = MRI->getType(VDataOut); 2107 NumVDataDwords = DMaskLanes; 2108 2109 if (IsD16 && !STI.hasUnpackedD16VMem()) 2110 NumVDataDwords = (DMaskLanes + 1) / 2; 2111 } 2112 } 2113 2114 // Set G16 opcode 2115 if (Subtarget->hasG16() && IsG16) { 2116 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 2117 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); 2118 assert(G16MappingInfo); 2119 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16 2120 } 2121 2122 // TODO: Check this in verifier. 2123 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this"); 2124 2125 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(); 2126 if (BaseOpcode->Atomic) 2127 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization 2128 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) | 2129 AMDGPU::CPol::VOLATILE)) 2130 return false; 2131 2132 int NumVAddrRegs = 0; 2133 int NumVAddrDwords = 0; 2134 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 2135 // Skip the $noregs and 0s inserted during legalization. 2136 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I); 2137 if (!AddrOp.isReg()) 2138 continue; // XXX - Break? 2139 2140 Register Addr = AddrOp.getReg(); 2141 if (!Addr) 2142 break; 2143 2144 ++NumVAddrRegs; 2145 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32; 2146 } 2147 2148 // The legalizer preprocessed the intrinsic arguments. If we aren't using 2149 // NSA, these should have been packed into a single value in the first 2150 // address register 2151 const bool UseNSA = 2152 NumVAddrRegs != 1 && 2153 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs 2154 : NumVAddrDwords == NumVAddrRegs); 2155 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { 2156 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); 2157 return false; 2158 } 2159 2160 if (IsTexFail) 2161 ++NumVDataDwords; 2162 2163 int Opcode = -1; 2164 if (IsGFX12Plus) { 2165 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12, 2166 NumVDataDwords, NumVAddrDwords); 2167 } else if (IsGFX11Plus) { 2168 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 2169 UseNSA ? 
AMDGPU::MIMGEncGfx11NSA 2170 : AMDGPU::MIMGEncGfx11Default, 2171 NumVDataDwords, NumVAddrDwords); 2172 } else if (IsGFX10Plus) { 2173 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, 2174 UseNSA ? AMDGPU::MIMGEncGfx10NSA 2175 : AMDGPU::MIMGEncGfx10Default, 2176 NumVDataDwords, NumVAddrDwords); 2177 } else { 2178 if (Subtarget->hasGFX90AInsts()) { 2179 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, 2180 NumVDataDwords, NumVAddrDwords); 2181 if (Opcode == -1) { 2182 LLVM_DEBUG( 2183 dbgs() 2184 << "requested image instruction is not supported on this GPU\n"); 2185 return false; 2186 } 2187 } 2188 if (Opcode == -1 && 2189 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 2190 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, 2191 NumVDataDwords, NumVAddrDwords); 2192 if (Opcode == -1) 2193 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, 2194 NumVDataDwords, NumVAddrDwords); 2195 } 2196 if (Opcode == -1) 2197 return false; 2198 2199 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) 2200 .cloneMemRefs(MI); 2201 2202 if (VDataOut) { 2203 if (BaseOpcode->AtomicX2) { 2204 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64; 2205 2206 Register TmpReg = MRI->createVirtualRegister( 2207 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass); 2208 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; 2209 2210 MIB.addDef(TmpReg); 2211 if (!MRI->use_empty(VDataOut)) { 2212 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut) 2213 .addReg(TmpReg, RegState::Kill, SubReg); 2214 } 2215 2216 } else { 2217 MIB.addDef(VDataOut); // vdata output 2218 } 2219 } 2220 2221 if (VDataIn) 2222 MIB.addReg(VDataIn); // vdata input 2223 2224 for (int I = 0; I != NumVAddrRegs; ++I) { 2225 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I); 2226 if (SrcOp.isReg()) { 2227 assert(SrcOp.getReg() != 0); 2228 MIB.addReg(SrcOp.getReg()); 2229 } 2230 } 2231 2232 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg()); 2233 if (BaseOpcode->Sampler) 2234 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg()); 2235 2236 MIB.addImm(DMask); // dmask 2237 2238 if (IsGFX10Plus) 2239 MIB.addImm(DimInfo->Encoding); 2240 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm)) 2241 MIB.addImm(Unorm); 2242 2243 MIB.addImm(CPol); 2244 MIB.addImm(IsA16 && // a16 or r128 2245 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0); 2246 if (IsGFX10Plus) 2247 MIB.addImm(IsA16 ? -1 : 0); 2248 2249 if (!Subtarget->hasGFX90AInsts()) { 2250 MIB.addImm(TFE); // tfe 2251 } else if (TFE) { 2252 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n"); 2253 return false; 2254 } 2255 2256 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe)) 2257 MIB.addImm(LWE); // lwe 2258 if (!IsGFX10Plus) 2259 MIB.addImm(DimInfo->DA ? -1 : 0); 2260 if (BaseOpcode->HasD16) 2261 MIB.addImm(IsD16 ? -1 : 0); 2262 2263 MI.eraseFromParent(); 2264 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 2265 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); 2266 return true; 2267 } 2268 2269 // We need to handle this here because tablegen doesn't support matching 2270 // instructions with multiple outputs. 
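// The ds_bvh_stack_* intrinsics return both a data value and the updated
// stack address, so the selected pseudo has two defs (Dst0 and Dst1 below).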
2271 bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2272     MachineInstr &MI) const {
2273   Register Dst0 = MI.getOperand(0).getReg();
2274   Register Dst1 = MI.getOperand(1).getReg();
2275
2276   const DebugLoc &DL = MI.getDebugLoc();
2277   MachineBasicBlock *MBB = MI.getParent();
2278
2279   Register Addr = MI.getOperand(3).getReg();
2280   Register Data0 = MI.getOperand(4).getReg();
2281   Register Data1 = MI.getOperand(5).getReg();
2282   unsigned Offset = MI.getOperand(6).getImm();
2283
2284   unsigned Opc;
2285   switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2286   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2287   case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
2288     Opc = AMDGPU::DS_BVH_STACK_RTN_B32;
2289     break;
2290   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
2291     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32;
2292     break;
2293   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
2294     Opc = AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64;
2295     break;
2296   }
2297
2298   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
2299                  .addDef(Dst1)
2300                  .addUse(Addr)
2301                  .addUse(Data0)
2302                  .addUse(Data1)
2303                  .addImm(Offset)
2304                  .cloneMemRefs(MI);
2305
2306   MI.eraseFromParent();
2307   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2308 }
2309
2310 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2311     MachineInstr &I) const {
2312   Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2313   switch (IntrinsicID) {
2314   case Intrinsic::amdgcn_end_cf:
2315     return selectEndCfIntrinsic(I);
2316   case Intrinsic::amdgcn_ds_ordered_add:
2317   case Intrinsic::amdgcn_ds_ordered_swap:
2318     return selectDSOrderedIntrinsic(I, IntrinsicID);
2319   case Intrinsic::amdgcn_ds_gws_init:
2320   case Intrinsic::amdgcn_ds_gws_barrier:
2321   case Intrinsic::amdgcn_ds_gws_sema_v:
2322   case Intrinsic::amdgcn_ds_gws_sema_br:
2323   case Intrinsic::amdgcn_ds_gws_sema_p:
2324   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2325     return selectDSGWSIntrinsic(I, IntrinsicID);
2326   case Intrinsic::amdgcn_ds_append:
2327     return selectDSAppendConsume(I, true);
2328   case Intrinsic::amdgcn_ds_consume:
2329     return selectDSAppendConsume(I, false);
2330   case Intrinsic::amdgcn_init_whole_wave:
2331     return selectInitWholeWave(I);
2332   case Intrinsic::amdgcn_s_barrier:
2333   case Intrinsic::amdgcn_s_barrier_signal:
2334   case Intrinsic::amdgcn_s_barrier_wait:
2335     return selectSBarrier(I);
2336   case Intrinsic::amdgcn_raw_buffer_load_lds:
2337   case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2338   case Intrinsic::amdgcn_struct_buffer_load_lds:
2339   case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2340     return selectBufferLoadLds(I);
2341   // Until we can store both the address space of the global and the LDS
2342   // arguments by having two MachineMemOperands on an intrinsic, we just trust
2343   // that the argument is a global pointer (buffer pointers have been handled by
2344   // an LLVM IR-level lowering).
2345 case Intrinsic::amdgcn_load_to_lds: 2346 case Intrinsic::amdgcn_global_load_lds: 2347 return selectGlobalLoadLds(I); 2348 case Intrinsic::amdgcn_exp_compr: 2349 if (!STI.hasCompressedExport()) { 2350 Function &F = I.getMF()->getFunction(); 2351 F.getContext().diagnose( 2352 DiagnosticInfoUnsupported(F, "intrinsic not supported on subtarget", 2353 I.getDebugLoc(), DS_Error)); 2354 return false; 2355 } 2356 break; 2357 case Intrinsic::amdgcn_ds_bvh_stack_rtn: 2358 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn: 2359 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: 2360 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: 2361 return selectDSBvhStackIntrinsic(I); 2362 case Intrinsic::amdgcn_s_barrier_signal_var: 2363 return selectNamedBarrierInit(I, IntrinsicID); 2364 case Intrinsic::amdgcn_s_get_named_barrier_state: 2365 return selectNamedBarrierInst(I, IntrinsicID); 2366 case Intrinsic::amdgcn_s_get_barrier_state: 2367 return selectSGetBarrierState(I, IntrinsicID); 2368 case Intrinsic::amdgcn_s_barrier_signal_isfirst: 2369 return selectSBarrierSignalIsfirst(I, IntrinsicID); 2370 } 2371 return selectImpl(I, *CoverageInfo); 2372 } 2373 2374 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const { 2375 if (selectImpl(I, *CoverageInfo)) 2376 return true; 2377 2378 MachineBasicBlock *BB = I.getParent(); 2379 const DebugLoc &DL = I.getDebugLoc(); 2380 2381 Register DstReg = I.getOperand(0).getReg(); 2382 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI); 2383 assert(Size <= 32 || Size == 64); 2384 const MachineOperand &CCOp = I.getOperand(1); 2385 Register CCReg = CCOp.getReg(); 2386 if (!isVCC(CCReg, *MRI)) { 2387 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : 2388 AMDGPU::S_CSELECT_B32; 2389 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) 2390 .addReg(CCReg); 2391 2392 // The generic constrainSelectedInstRegOperands doesn't work for the scc register 2393 // bank, because it does not cover the register class that we used to represent 2394 // for it. So we need to manually set the register class here. 2395 if (!MRI->getRegClassOrNull(CCReg)) 2396 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI)); 2397 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg) 2398 .add(I.getOperand(2)) 2399 .add(I.getOperand(3)); 2400 2401 bool Ret = false; 2402 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 2403 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); 2404 I.eraseFromParent(); 2405 return Ret; 2406 } 2407 2408 // Wide VGPR select should have been split in RegBankSelect. 2409 if (Size > 32) 2410 return false; 2411 2412 MachineInstr *Select = 2413 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 2414 .addImm(0) 2415 .add(I.getOperand(3)) 2416 .addImm(0) 2417 .add(I.getOperand(2)) 2418 .add(I.getOperand(1)); 2419 2420 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); 2421 I.eraseFromParent(); 2422 return Ret; 2423 } 2424 2425 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const { 2426 Register DstReg = I.getOperand(0).getReg(); 2427 Register SrcReg = I.getOperand(1).getReg(); 2428 const LLT DstTy = MRI->getType(DstReg); 2429 const LLT SrcTy = MRI->getType(SrcReg); 2430 const LLT S1 = LLT::scalar(1); 2431 2432 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 2433 const RegisterBank *DstRB; 2434 if (DstTy == S1) { 2435 // This is a special case. 
We don't treat s1 for legalization artifacts as 2436 // vcc booleans. 2437 DstRB = SrcRB; 2438 } else { 2439 DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 2440 if (SrcRB != DstRB) 2441 return false; 2442 } 2443 2444 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 2445 2446 unsigned DstSize = DstTy.getSizeInBits(); 2447 unsigned SrcSize = SrcTy.getSizeInBits(); 2448 2449 const TargetRegisterClass *SrcRC = 2450 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB); 2451 const TargetRegisterClass *DstRC = 2452 TRI.getRegClassForSizeOnBank(DstSize, *DstRB); 2453 if (!SrcRC || !DstRC) 2454 return false; 2455 2456 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 2457 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) { 2458 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n"); 2459 return false; 2460 } 2461 2462 if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) { 2463 assert(STI.useRealTrue16Insts()); 2464 const DebugLoc &DL = I.getDebugLoc(); 2465 MachineBasicBlock *MBB = I.getParent(); 2466 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg) 2467 .addReg(SrcReg, 0, AMDGPU::lo16); 2468 I.eraseFromParent(); 2469 return true; 2470 } 2471 2472 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) { 2473 MachineBasicBlock *MBB = I.getParent(); 2474 const DebugLoc &DL = I.getDebugLoc(); 2475 2476 Register LoReg = MRI->createVirtualRegister(DstRC); 2477 Register HiReg = MRI->createVirtualRegister(DstRC); 2478 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg) 2479 .addReg(SrcReg, 0, AMDGPU::sub0); 2480 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg) 2481 .addReg(SrcReg, 0, AMDGPU::sub1); 2482 2483 if (IsVALU && STI.hasSDWA()) { 2484 // Write the low 16-bits of the high element into the high 16-bits of the 2485 // low element. 2486 MachineInstr *MovSDWA = 2487 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg) 2488 .addImm(0) // $src0_modifiers 2489 .addReg(HiReg) // $src0 2490 .addImm(0) // $clamp 2491 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel 2492 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused 2493 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel 2494 .addReg(LoReg, RegState::Implicit); 2495 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1); 2496 } else { 2497 Register TmpReg0 = MRI->createVirtualRegister(DstRC); 2498 Register TmpReg1 = MRI->createVirtualRegister(DstRC); 2499 Register ImmReg = MRI->createVirtualRegister(DstRC); 2500 if (IsVALU) { 2501 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0) 2502 .addImm(16) 2503 .addReg(HiReg); 2504 } else { 2505 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0) 2506 .addReg(HiReg) 2507 .addImm(16) 2508 .setOperandDead(3); // Dead scc 2509 } 2510 2511 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2512 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 2513 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32; 2514 2515 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg) 2516 .addImm(0xffff); 2517 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1) 2518 .addReg(LoReg) 2519 .addReg(ImmReg); 2520 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg) 2521 .addReg(TmpReg0) 2522 .addReg(TmpReg1); 2523 2524 if (!IsVALU) { 2525 And.setOperandDead(3); // Dead scc 2526 Or.setOperandDead(3); // Dead scc 2527 } 2528 } 2529 2530 I.eraseFromParent(); 2531 return true; 2532 } 2533 2534 if (!DstTy.isScalar()) 2535 return false; 2536 2537 if (SrcSize > 32) { 2538 unsigned SubRegIdx = DstSize < 32 2539 ? 
static_cast<unsigned>(AMDGPU::sub0) 2540 : TRI.getSubRegFromChannel(0, DstSize / 32); 2541 if (SubRegIdx == AMDGPU::NoSubRegister) 2542 return false; 2543 2544 // Deal with weird cases where the class only partially supports the subreg 2545 // index. 2546 const TargetRegisterClass *SrcWithSubRC 2547 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx); 2548 if (!SrcWithSubRC) 2549 return false; 2550 2551 if (SrcWithSubRC != SrcRC) { 2552 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI)) 2553 return false; 2554 } 2555 2556 I.getOperand(1).setSubReg(SubRegIdx); 2557 } 2558 2559 I.setDesc(TII.get(TargetOpcode::COPY)); 2560 return true; 2561 } 2562 2563 /// \returns true if a bitmask for \p Size bits will be an inline immediate. 2564 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) { 2565 Mask = maskTrailingOnes<unsigned>(Size); 2566 int SignedMask = static_cast<int>(Mask); 2567 return SignedMask >= -16 && SignedMask <= 64; 2568 } 2569 2570 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 2571 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( 2572 Register Reg, const MachineRegisterInfo &MRI, 2573 const TargetRegisterInfo &TRI) const { 2574 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); 2575 if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank)) 2576 return RB; 2577 2578 // Ignore the type, since we don't use vcc in artifacts. 2579 if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank)) 2580 return &RBI.getRegBankFromRegClass(*RC, LLT()); 2581 return nullptr; 2582 } 2583 2584 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { 2585 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG; 2586 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg; 2587 const DebugLoc &DL = I.getDebugLoc(); 2588 MachineBasicBlock &MBB = *I.getParent(); 2589 const Register DstReg = I.getOperand(0).getReg(); 2590 const Register SrcReg = I.getOperand(1).getReg(); 2591 2592 const LLT DstTy = MRI->getType(DstReg); 2593 const LLT SrcTy = MRI->getType(SrcReg); 2594 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ? 2595 I.getOperand(2).getImm() : SrcTy.getSizeInBits(); 2596 const unsigned DstSize = DstTy.getSizeInBits(); 2597 if (!DstTy.isScalar()) 2598 return false; 2599 2600 // Artifact casts should never use vcc. 2601 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI); 2602 2603 // FIXME: This should probably be illegal and split earlier. 2604 if (I.getOpcode() == AMDGPU::G_ANYEXT) { 2605 if (DstSize <= 32) 2606 return selectCOPY(I); 2607 2608 const TargetRegisterClass *SrcRC = 2609 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank); 2610 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI); 2611 const TargetRegisterClass *DstRC = 2612 TRI.getRegClassForSizeOnBank(DstSize, *DstBank); 2613 2614 Register UndefReg = MRI->createVirtualRegister(SrcRC); 2615 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2616 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2617 .addReg(SrcReg) 2618 .addImm(AMDGPU::sub0) 2619 .addReg(UndefReg) 2620 .addImm(AMDGPU::sub1); 2621 I.eraseFromParent(); 2622 2623 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) && 2624 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI); 2625 } 2626 2627 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) { 2628 // 64-bit should have been split up in RegBankSelect 2629 2630 // Try to use an and with a mask if it will save code size. 
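    // shouldUseAndMask only succeeds when the (1 << SrcSize) - 1 mask is an
    // inline immediate (e.g. a zero-extend from s4 can use the immediate 15,
    // while one from s8 with mask 0xff cannot), so the AND form never costs an
    // extra literal; otherwise fall back to V_BFE with offset 0 and width
    // SrcSize.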
2631 unsigned Mask; 2632 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2633 MachineInstr *ExtI = 2634 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg) 2635 .addImm(Mask) 2636 .addReg(SrcReg); 2637 I.eraseFromParent(); 2638 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2639 } 2640 2641 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; 2642 MachineInstr *ExtI = 2643 BuildMI(MBB, I, DL, TII.get(BFE), DstReg) 2644 .addReg(SrcReg) 2645 .addImm(0) // Offset 2646 .addImm(SrcSize); // Width 2647 I.eraseFromParent(); 2648 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); 2649 } 2650 2651 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) { 2652 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ? 2653 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass; 2654 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI)) 2655 return false; 2656 2657 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) { 2658 const unsigned SextOpc = SrcSize == 8 ? 2659 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16; 2660 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg) 2661 .addReg(SrcReg); 2662 I.eraseFromParent(); 2663 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2664 } 2665 2666 // Using a single 32-bit SALU to calculate the high half is smaller than 2667 // S_BFE with a literal constant operand. 2668 if (DstSize > 32 && SrcSize == 32) { 2669 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2670 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister; 2671 if (Signed) { 2672 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg) 2673 .addReg(SrcReg, 0, SubReg) 2674 .addImm(31) 2675 .setOperandDead(3); // Dead scc 2676 } else { 2677 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) 2678 .addImm(0); 2679 } 2680 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 2681 .addReg(SrcReg, 0, SubReg) 2682 .addImm(AMDGPU::sub0) 2683 .addReg(HiReg) 2684 .addImm(AMDGPU::sub1); 2685 I.eraseFromParent(); 2686 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, 2687 *MRI); 2688 } 2689 2690 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64; 2691 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; 2692 2693 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width. 2694 if (DstSize > 32 && (SrcSize <= 32 || InReg)) { 2695 // We need a 64-bit register source, but the high bits don't matter. 2696 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); 2697 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2698 unsigned SubReg = InReg ? 
AMDGPU::sub0 : AMDGPU::NoSubRegister; 2699 2700 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg); 2701 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg) 2702 .addReg(SrcReg, 0, SubReg) 2703 .addImm(AMDGPU::sub0) 2704 .addReg(UndefReg) 2705 .addImm(AMDGPU::sub1); 2706 2707 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg) 2708 .addReg(ExtReg) 2709 .addImm(SrcSize << 16); 2710 2711 I.eraseFromParent(); 2712 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI); 2713 } 2714 2715 unsigned Mask; 2716 if (!Signed && shouldUseAndMask(SrcSize, Mask)) { 2717 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg) 2718 .addReg(SrcReg) 2719 .addImm(Mask) 2720 .setOperandDead(3); // Dead scc 2721 } else { 2722 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg) 2723 .addReg(SrcReg) 2724 .addImm(SrcSize << 16); 2725 } 2726 2727 I.eraseFromParent(); 2728 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI); 2729 } 2730 2731 return false; 2732 } 2733 2734 static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) { 2735 return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg; 2736 } 2737 2738 static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) { 2739 Register BitcastSrc; 2740 if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc)))) 2741 Reg = BitcastSrc; 2742 return Reg; 2743 } 2744 2745 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In, 2746 Register &Out) { 2747 Register Trunc; 2748 if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc)))) 2749 return false; 2750 2751 Register LShlSrc; 2752 Register Cst; 2753 if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) { 2754 Cst = stripCopy(Cst, MRI); 2755 if (mi_match(Cst, MRI, m_SpecificICst(16))) { 2756 Out = stripBitCast(LShlSrc, MRI); 2757 return true; 2758 } 2759 } 2760 2761 MachineInstr *Shuffle = MRI.getVRegDef(Trunc); 2762 if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR) 2763 return false; 2764 2765 assert(MRI.getType(Shuffle->getOperand(0).getReg()) == 2766 LLT::fixed_vector(2, 16)); 2767 2768 ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask(); 2769 assert(Mask.size() == 2); 2770 2771 if (Mask[0] == 1 && Mask[1] <= 1) { 2772 Out = Shuffle->getOperand(0).getReg(); 2773 return true; 2774 } 2775 2776 return false; 2777 } 2778 2779 bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const { 2780 if (!Subtarget->hasSALUFloatInsts()) 2781 return false; 2782 2783 Register Dst = I.getOperand(0).getReg(); 2784 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2785 if (DstRB->getID() != AMDGPU::SGPRRegBankID) 2786 return false; 2787 2788 Register Src = I.getOperand(1).getReg(); 2789 2790 if (MRI->getType(Dst) == LLT::scalar(32) && 2791 MRI->getType(Src) == LLT::scalar(16)) { 2792 if (isExtractHiElt(*MRI, Src, Src)) { 2793 MachineBasicBlock *BB = I.getParent(); 2794 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst) 2795 .addUse(Src); 2796 I.eraseFromParent(); 2797 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); 2798 } 2799 } 2800 2801 return false; 2802 } 2803 2804 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const { 2805 // Only manually handle the f64 SGPR case. 2806 // 2807 // FIXME: This is a workaround for 2.5 different tablegen problems. Because 2808 // the bit ops theoretically have a second result due to the implicit def of 2809 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2810 // that is easy by disabling the check. 
The result works, but uses a 2811 // nonsensical sreg32orlds_and_sreg_1 regclass. 2812 // 2813 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2814 // the variadic REG_SEQUENCE operands. 2815 2816 Register Dst = MI.getOperand(0).getReg(); 2817 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2818 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2819 MRI->getType(Dst) != LLT::scalar(64)) 2820 return false; 2821 2822 Register Src = MI.getOperand(1).getReg(); 2823 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI); 2824 if (Fabs) 2825 Src = Fabs->getOperand(1).getReg(); 2826 2827 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2828 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2829 return false; 2830 2831 MachineBasicBlock *BB = MI.getParent(); 2832 const DebugLoc &DL = MI.getDebugLoc(); 2833 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2834 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2835 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2836 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2837 2838 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2839 .addReg(Src, 0, AMDGPU::sub0); 2840 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2841 .addReg(Src, 0, AMDGPU::sub1); 2842 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2843 .addImm(0x80000000); 2844 2845 // Set or toggle sign bit. 2846 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32; 2847 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg) 2848 .addReg(HiReg) 2849 .addReg(ConstReg) 2850 .setOperandDead(3); // Dead scc 2851 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2852 .addReg(LoReg) 2853 .addImm(AMDGPU::sub0) 2854 .addReg(OpReg) 2855 .addImm(AMDGPU::sub1); 2856 MI.eraseFromParent(); 2857 return true; 2858 } 2859 2860 // FIXME: This is a workaround for the same tablegen problems as G_FNEG 2861 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const { 2862 Register Dst = MI.getOperand(0).getReg(); 2863 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI); 2864 if (DstRB->getID() != AMDGPU::SGPRRegBankID || 2865 MRI->getType(Dst) != LLT::scalar(64)) 2866 return false; 2867 2868 Register Src = MI.getOperand(1).getReg(); 2869 MachineBasicBlock *BB = MI.getParent(); 2870 const DebugLoc &DL = MI.getDebugLoc(); 2871 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2872 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2873 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2874 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 2875 2876 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) || 2877 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI)) 2878 return false; 2879 2880 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg) 2881 .addReg(Src, 0, AMDGPU::sub0); 2882 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg) 2883 .addReg(Src, 0, AMDGPU::sub1); 2884 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg) 2885 .addImm(0x7fffffff); 2886 2887 // Clear sign bit. 2888 // TODO: Should this used S_BITSET0_*? 
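  // Only the sign bit (bit 63) needs to change, so the AND below touches just
  // the high 32-bit half; the low half is copied through unchanged and the
  // two halves are recombined with a REG_SEQUENCE.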
2889 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg) 2890 .addReg(HiReg) 2891 .addReg(ConstReg) 2892 .setOperandDead(3); // Dead scc 2893 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst) 2894 .addReg(LoReg) 2895 .addImm(AMDGPU::sub0) 2896 .addReg(OpReg) 2897 .addImm(AMDGPU::sub1); 2898 2899 MI.eraseFromParent(); 2900 return true; 2901 } 2902 2903 static bool isConstant(const MachineInstr &MI) { 2904 return MI.getOpcode() == TargetOpcode::G_CONSTANT; 2905 } 2906 2907 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, 2908 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { 2909 2910 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1; 2911 const MachineInstr *PtrMI = 2912 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg()); 2913 2914 assert(PtrMI); 2915 2916 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD) 2917 return; 2918 2919 GEPInfo GEPInfo; 2920 2921 for (unsigned i = 1; i != 3; ++i) { 2922 const MachineOperand &GEPOp = PtrMI->getOperand(i); 2923 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg()); 2924 assert(OpDef); 2925 if (i == 2 && isConstant(*OpDef)) { 2926 // TODO: Could handle constant base + variable offset, but a combine 2927 // probably should have commuted it. 2928 assert(GEPInfo.Imm == 0); 2929 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue(); 2930 continue; 2931 } 2932 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI); 2933 if (OpBank->getID() == AMDGPU::SGPRRegBankID) 2934 GEPInfo.SgprParts.push_back(GEPOp.getReg()); 2935 else 2936 GEPInfo.VgprParts.push_back(GEPOp.getReg()); 2937 } 2938 2939 AddrInfo.push_back(GEPInfo); 2940 getAddrModeInfo(*PtrMI, MRI, AddrInfo); 2941 } 2942 2943 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const { 2944 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID; 2945 } 2946 2947 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { 2948 if (!MI.hasOneMemOperand()) 2949 return false; 2950 2951 const MachineMemOperand *MMO = *MI.memoperands_begin(); 2952 const Value *Ptr = MMO->getValue(); 2953 2954 // UndefValue means this is a load of a kernel input. These are uniform. 2955 // Sometimes LDS instructions have constant pointers. 2956 // If Ptr is null, then that means this mem operand contains a 2957 // PseudoSourceValue like GOT. 2958 if (!Ptr || isa<UndefValue, Argument, Constant, GlobalValue>(Ptr)) 2959 return true; 2960 2961 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2962 return true; 2963 2964 if (MI.getOpcode() == AMDGPU::G_PREFETCH) 2965 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == 2966 AMDGPU::SGPRRegBankID; 2967 2968 const Instruction *I = dyn_cast<Instruction>(Ptr); 2969 return I && I->getMetadata("amdgpu.uniform"); 2970 } 2971 2972 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const { 2973 for (const GEPInfo &GEPInfo : AddrInfo) { 2974 if (!GEPInfo.VgprParts.empty()) 2975 return true; 2976 } 2977 return false; 2978 } 2979 2980 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { 2981 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); 2982 unsigned AS = PtrTy.getAddressSpace(); 2983 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) && 2984 STI.ldsRequiresM0Init()) { 2985 MachineBasicBlock *BB = I.getParent(); 2986 2987 // If DS instructions require M0 initialization, insert it before selecting. 
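    // On subtargets where ldsRequiresM0Init() is true, DS addressing is
    // effectively bounded by m0, so program it to all ones (-1) up front to
    // cover the full LDS range.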
2988 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) 2989 .addImm(-1); 2990 } 2991 } 2992 2993 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( 2994 MachineInstr &I) const { 2995 initM0(I); 2996 return selectImpl(I, *CoverageInfo); 2997 } 2998 2999 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) { 3000 if (Reg.isPhysical()) 3001 return false; 3002 3003 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg); 3004 const unsigned Opcode = MI.getOpcode(); 3005 3006 if (Opcode == AMDGPU::COPY) 3007 return isVCmpResult(MI.getOperand(1).getReg(), MRI); 3008 3009 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR || 3010 Opcode == AMDGPU::G_XOR) 3011 return isVCmpResult(MI.getOperand(1).getReg(), MRI) && 3012 isVCmpResult(MI.getOperand(2).getReg(), MRI); 3013 3014 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) 3015 return GI->is(Intrinsic::amdgcn_class); 3016 3017 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP; 3018 } 3019 3020 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const { 3021 MachineBasicBlock *BB = I.getParent(); 3022 MachineOperand &CondOp = I.getOperand(0); 3023 Register CondReg = CondOp.getReg(); 3024 const DebugLoc &DL = I.getDebugLoc(); 3025 3026 unsigned BrOpcode; 3027 Register CondPhysReg; 3028 const TargetRegisterClass *ConstrainRC; 3029 3030 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide 3031 // whether the branch is uniform when selecting the instruction. In 3032 // GlobalISel, we should push that decision into RegBankSelect. Assume for now 3033 // RegBankSelect knows what it's doing if the branch condition is scc, even 3034 // though it currently does not. 3035 if (!isVCC(CondReg, *MRI)) { 3036 if (MRI->getType(CondReg) != LLT::scalar(32)) 3037 return false; 3038 3039 CondPhysReg = AMDGPU::SCC; 3040 BrOpcode = AMDGPU::S_CBRANCH_SCC1; 3041 ConstrainRC = &AMDGPU::SReg_32RegClass; 3042 } else { 3043 // FIXME: Should scc->vcc copies and with exec? 3044 3045 // Unless the value of CondReg is a result of a V_CMP* instruction then we 3046 // need to insert an and with exec. 3047 if (!isVCmpResult(CondReg, *MRI)) { 3048 const bool Is64 = STI.isWave64(); 3049 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; 3050 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; 3051 3052 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC()); 3053 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg) 3054 .addReg(CondReg) 3055 .addReg(Exec) 3056 .setOperandDead(3); // Dead scc 3057 CondReg = TmpReg; 3058 } 3059 3060 CondPhysReg = TRI.getVCC(); 3061 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ; 3062 ConstrainRC = TRI.getBoolRC(); 3063 } 3064 3065 if (!MRI->getRegClassOrNull(CondReg)) 3066 MRI->setRegClass(CondReg, ConstrainRC); 3067 3068 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg) 3069 .addReg(CondReg); 3070 BuildMI(*BB, &I, DL, TII.get(BrOpcode)) 3071 .addMBB(I.getOperand(1).getMBB()); 3072 3073 I.eraseFromParent(); 3074 return true; 3075 } 3076 3077 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE( 3078 MachineInstr &I) const { 3079 Register DstReg = I.getOperand(0).getReg(); 3080 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3081 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 3082 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32)); 3083 if (IsVGPR) 3084 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 3085 3086 return RBI.constrainGenericRegister( 3087 DstReg, IsVGPR ? 
AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI); 3088 } 3089 3090 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { 3091 Register DstReg = I.getOperand(0).getReg(); 3092 Register SrcReg = I.getOperand(1).getReg(); 3093 Register MaskReg = I.getOperand(2).getReg(); 3094 LLT Ty = MRI->getType(DstReg); 3095 LLT MaskTy = MRI->getType(MaskReg); 3096 MachineBasicBlock *BB = I.getParent(); 3097 const DebugLoc &DL = I.getDebugLoc(); 3098 3099 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3100 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 3101 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI); 3102 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID; 3103 if (DstRB != SrcRB) // Should only happen for hand written MIR. 3104 return false; 3105 3106 // Try to avoid emitting a bit operation when we only need to touch half of 3107 // the 64-bit pointer. 3108 APInt MaskOnes = VT->getKnownOnes(MaskReg).zext(64); 3109 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); 3110 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); 3111 3112 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32; 3113 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32; 3114 3115 if (!IsVGPR && Ty.getSizeInBits() == 64 && 3116 !CanCopyLow32 && !CanCopyHi32) { 3117 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg) 3118 .addReg(SrcReg) 3119 .addReg(MaskReg) 3120 .setOperandDead(3); // Dead scc 3121 I.eraseFromParent(); 3122 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3123 } 3124 3125 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32; 3126 const TargetRegisterClass &RegRC 3127 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3128 3129 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB); 3130 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB); 3131 const TargetRegisterClass *MaskRC = 3132 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB); 3133 3134 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 3135 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 3136 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI)) 3137 return false; 3138 3139 if (Ty.getSizeInBits() == 32) { 3140 assert(MaskTy.getSizeInBits() == 32 && 3141 "ptrmask should have been narrowed during legalize"); 3142 3143 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg) 3144 .addReg(SrcReg) 3145 .addReg(MaskReg); 3146 3147 if (!IsVGPR) 3148 NewOp.setOperandDead(3); // Dead scc 3149 I.eraseFromParent(); 3150 return true; 3151 } 3152 3153 Register HiReg = MRI->createVirtualRegister(&RegRC); 3154 Register LoReg = MRI->createVirtualRegister(&RegRC); 3155 3156 // Extract the subregisters from the source pointer. 3157 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg) 3158 .addReg(SrcReg, 0, AMDGPU::sub0); 3159 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg) 3160 .addReg(SrcReg, 0, AMDGPU::sub1); 3161 3162 Register MaskedLo, MaskedHi; 3163 3164 if (CanCopyLow32) { 3165 // If all the bits in the low half are 1, we only need a copy for it. 3166 MaskedLo = LoReg; 3167 } else { 3168 // Extract the mask subregister and apply the and. 
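    // For example, with an alignment-style mask such as 0xFFFFFFFFFFFFF000
    // the high word is all ones and is simply copied, and only this low-half
    // AND is emitted.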
3169 Register MaskLo = MRI->createVirtualRegister(&RegRC); 3170 MaskedLo = MRI->createVirtualRegister(&RegRC); 3171 3172 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo) 3173 .addReg(MaskReg, 0, AMDGPU::sub0); 3174 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo) 3175 .addReg(LoReg) 3176 .addReg(MaskLo); 3177 } 3178 3179 if (CanCopyHi32) { 3180 // If all the bits in the high half are 1, we only need a copy for it. 3181 MaskedHi = HiReg; 3182 } else { 3183 Register MaskHi = MRI->createVirtualRegister(&RegRC); 3184 MaskedHi = MRI->createVirtualRegister(&RegRC); 3185 3186 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi) 3187 .addReg(MaskReg, 0, AMDGPU::sub1); 3188 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi) 3189 .addReg(HiReg) 3190 .addReg(MaskHi); 3191 } 3192 3193 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) 3194 .addReg(MaskedLo) 3195 .addImm(AMDGPU::sub0) 3196 .addReg(MaskedHi) 3197 .addImm(AMDGPU::sub1); 3198 I.eraseFromParent(); 3199 return true; 3200 } 3201 3202 /// Return the register to use for the index value, and the subregister to use 3203 /// for the indirectly accessed register. 3204 static std::pair<Register, unsigned> 3205 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI, 3206 const TargetRegisterClass *SuperRC, Register IdxReg, 3207 unsigned EltSize, GISelValueTracking &ValueTracking) { 3208 Register IdxBaseReg; 3209 int Offset; 3210 3211 std::tie(IdxBaseReg, Offset) = 3212 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &ValueTracking); 3213 if (IdxBaseReg == AMDGPU::NoRegister) { 3214 // This will happen if the index is a known constant. This should ordinarily 3215 // be legalized out, but handle it as a register just in case. 3216 assert(Offset == 0); 3217 IdxBaseReg = IdxReg; 3218 } 3219 3220 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize); 3221 3222 // Skip out of bounds offsets, or else we would end up using an undefined 3223 // register. 3224 if (static_cast<unsigned>(Offset) >= SubRegs.size()) 3225 return std::pair(IdxReg, SubRegs[0]); 3226 return std::pair(IdxBaseReg, SubRegs[Offset]); 3227 } 3228 3229 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( 3230 MachineInstr &MI) const { 3231 Register DstReg = MI.getOperand(0).getReg(); 3232 Register SrcReg = MI.getOperand(1).getReg(); 3233 Register IdxReg = MI.getOperand(2).getReg(); 3234 3235 LLT DstTy = MRI->getType(DstReg); 3236 LLT SrcTy = MRI->getType(SrcReg); 3237 3238 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3239 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI); 3240 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3241 3242 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3243 // into a waterfall loop. 
3244 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3245 return false; 3246 3247 const TargetRegisterClass *SrcRC = 3248 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB); 3249 const TargetRegisterClass *DstRC = 3250 TRI.getRegClassForTypeOnBank(DstTy, *DstRB); 3251 if (!SrcRC || !DstRC) 3252 return false; 3253 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) || 3254 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) || 3255 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3256 return false; 3257 3258 MachineBasicBlock *BB = MI.getParent(); 3259 const DebugLoc &DL = MI.getDebugLoc(); 3260 const bool Is64 = DstTy.getSizeInBits() == 64; 3261 3262 unsigned SubReg; 3263 std::tie(IdxReg, SubReg) = computeIndirectRegIndex( 3264 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *VT); 3265 3266 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { 3267 if (DstTy.getSizeInBits() != 32 && !Is64) 3268 return false; 3269 3270 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3271 .addReg(IdxReg); 3272 3273 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32; 3274 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg) 3275 .addReg(SrcReg, 0, SubReg) 3276 .addReg(SrcReg, RegState::Implicit); 3277 MI.eraseFromParent(); 3278 return true; 3279 } 3280 3281 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32) 3282 return false; 3283 3284 if (!STI.useVGPRIndexMode()) { 3285 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3286 .addReg(IdxReg); 3287 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg) 3288 .addReg(SrcReg, 0, SubReg) 3289 .addReg(SrcReg, RegState::Implicit); 3290 MI.eraseFromParent(); 3291 return true; 3292 } 3293 3294 const MCInstrDesc &GPRIDXDesc = 3295 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true); 3296 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3297 .addReg(SrcReg) 3298 .addReg(IdxReg) 3299 .addImm(SubReg); 3300 3301 MI.eraseFromParent(); 3302 return true; 3303 } 3304 3305 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd 3306 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( 3307 MachineInstr &MI) const { 3308 Register DstReg = MI.getOperand(0).getReg(); 3309 Register VecReg = MI.getOperand(1).getReg(); 3310 Register ValReg = MI.getOperand(2).getReg(); 3311 Register IdxReg = MI.getOperand(3).getReg(); 3312 3313 LLT VecTy = MRI->getType(DstReg); 3314 LLT ValTy = MRI->getType(ValReg); 3315 unsigned VecSize = VecTy.getSizeInBits(); 3316 unsigned ValSize = ValTy.getSizeInBits(); 3317 3318 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); 3319 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); 3320 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); 3321 3322 assert(VecTy.getElementType() == ValTy); 3323 3324 // The index must be scalar. If it wasn't RegBankSelect should have moved this 3325 // into a waterfall loop. 
3326 if (IdxRB->getID() != AMDGPU::SGPRRegBankID) 3327 return false; 3328 3329 const TargetRegisterClass *VecRC = 3330 TRI.getRegClassForTypeOnBank(VecTy, *VecRB); 3331 const TargetRegisterClass *ValRC = 3332 TRI.getRegClassForTypeOnBank(ValTy, *ValRB); 3333 3334 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || 3335 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || 3336 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || 3337 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) 3338 return false; 3339 3340 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) 3341 return false; 3342 3343 unsigned SubReg; 3344 std::tie(IdxReg, SubReg) = 3345 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *VT); 3346 3347 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && 3348 STI.useVGPRIndexMode(); 3349 3350 MachineBasicBlock *BB = MI.getParent(); 3351 const DebugLoc &DL = MI.getDebugLoc(); 3352 3353 if (!IndexMode) { 3354 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3355 .addReg(IdxReg); 3356 3357 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo( 3358 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID); 3359 BuildMI(*BB, MI, DL, RegWriteOp, DstReg) 3360 .addReg(VecReg) 3361 .addReg(ValReg) 3362 .addImm(SubReg); 3363 MI.eraseFromParent(); 3364 return true; 3365 } 3366 3367 const MCInstrDesc &GPRIDXDesc = 3368 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); 3369 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg) 3370 .addReg(VecReg) 3371 .addReg(ValReg) 3372 .addReg(IdxReg) 3373 .addImm(SubReg); 3374 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { 3380 if (!Subtarget->hasVMemToLDSLoad()) 3381 return false; 3382 unsigned Opc; 3383 unsigned Size = MI.getOperand(3).getImm(); 3384 3385 // The struct intrinsic variants add one additional operand over raw. 3386 const bool HasVIndex = MI.getNumOperands() == 9; 3387 Register VIndex; 3388 int OpOffset = 0; 3389 if (HasVIndex) { 3390 VIndex = MI.getOperand(4).getReg(); 3391 OpOffset = 1; 3392 } 3393 3394 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3395 std::optional<ValueAndVReg> MaybeVOffset = 3396 getIConstantVRegValWithLookThrough(VOffset, *MRI); 3397 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue(); 3398 3399 switch (Size) { 3400 default: 3401 return false; 3402 case 1: 3403 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN 3404 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN 3405 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN 3406 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET; 3407 break; 3408 case 2: 3409 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN 3410 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN 3411 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN 3412 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET; 3413 break; 3414 case 4: 3415 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN 3416 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN 3417 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN 3418 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; 3419 break; 3420 case 12: 3421 if (!Subtarget->hasLDSLoadB96_B128()) 3422 return false; 3423 3424 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN 3425 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN 3426 : HasVOffset ? 
AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN 3427 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; 3428 break; 3429 case 16: 3430 if (!Subtarget->hasLDSLoadB96_B128()) 3431 return false; 3432 3433 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN 3434 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN 3435 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN 3436 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; 3437 break; 3438 } 3439 3440 MachineBasicBlock *MBB = MI.getParent(); 3441 const DebugLoc &DL = MI.getDebugLoc(); 3442 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3443 .add(MI.getOperand(2)); 3444 3445 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)); 3446 3447 if (HasVIndex && HasVOffset) { 3448 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class()); 3449 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg) 3450 .addReg(VIndex) 3451 .addImm(AMDGPU::sub0) 3452 .addReg(VOffset) 3453 .addImm(AMDGPU::sub1); 3454 3455 MIB.addReg(IdxReg); 3456 } else if (HasVIndex) { 3457 MIB.addReg(VIndex); 3458 } else if (HasVOffset) { 3459 MIB.addReg(VOffset); 3460 } 3461 3462 MIB.add(MI.getOperand(1)); // rsrc 3463 MIB.add(MI.getOperand(5 + OpOffset)); // soffset 3464 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset 3465 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); 3466 unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); 3467 MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL 3468 : AMDGPU::CPol::ALL_pregfx12)); // cpol 3469 MIB.addImm( 3470 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) 3471 ? 1 3472 : 0); // swz 3473 3474 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3475 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3476 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm(); 3477 MachinePointerInfo StorePtrI = LoadPtrI; 3478 StorePtrI.V = nullptr; 3479 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3480 3481 auto F = LoadMMO->getFlags() & 3482 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3483 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3484 Size, LoadMMO->getBaseAlign()); 3485 3486 MachineMemOperand *StoreMMO = 3487 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3488 sizeof(int32_t), LoadMMO->getBaseAlign()); 3489 3490 MIB.setMemRefs({LoadMMO, StoreMMO}); 3491 3492 MI.eraseFromParent(); 3493 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3494 } 3495 3496 /// Match a zero extend from a 32-bit value to 64-bits. 3497 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { 3498 Register ZExtSrc; 3499 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc)))) 3500 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? 
ZExtSrc : Register(); 3501 3502 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0) 3503 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 3504 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) 3505 return Register(); 3506 3507 assert(Def->getNumOperands() == 3 && 3508 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); 3509 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) { 3510 return Def->getOperand(1).getReg(); 3511 } 3512 3513 return Register(); 3514 } 3515 3516 bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ 3517 if (!Subtarget->hasVMemToLDSLoad()) 3518 return false; 3519 3520 unsigned Opc; 3521 unsigned Size = MI.getOperand(3).getImm(); 3522 3523 switch (Size) { 3524 default: 3525 return false; 3526 case 1: 3527 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE; 3528 break; 3529 case 2: 3530 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT; 3531 break; 3532 case 4: 3533 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; 3534 break; 3535 case 12: 3536 if (!Subtarget->hasLDSLoadB96_B128()) 3537 return false; 3538 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; 3539 break; 3540 case 16: 3541 if (!Subtarget->hasLDSLoadB96_B128()) 3542 return false; 3543 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; 3544 break; 3545 } 3546 3547 MachineBasicBlock *MBB = MI.getParent(); 3548 const DebugLoc &DL = MI.getDebugLoc(); 3549 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 3550 .add(MI.getOperand(2)); 3551 3552 Register Addr = MI.getOperand(1).getReg(); 3553 Register VOffset; 3554 // Try to split SAddr and VOffset. Global and LDS pointers share the same 3555 // immediate offset, so we cannot use a regular SelectGlobalSAddr(). 3556 if (!isSGPR(Addr)) { 3557 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 3558 if (isSGPR(AddrDef->Reg)) { 3559 Addr = AddrDef->Reg; 3560 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 3561 Register SAddr = 3562 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 3563 if (isSGPR(SAddr)) { 3564 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 3565 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 3566 Addr = SAddr; 3567 VOffset = Off; 3568 } 3569 } 3570 } 3571 } 3572 3573 if (isSGPR(Addr)) { 3574 Opc = AMDGPU::getGlobalSaddrOp(Opc); 3575 if (!VOffset) { 3576 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3577 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 3578 .addImm(0); 3579 } 3580 } 3581 3582 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) 3583 .addReg(Addr); 3584 3585 if (isSGPR(Addr)) 3586 MIB.addReg(VOffset); 3587 3588 MIB.add(MI.getOperand(4)) // offset 3589 .add(MI.getOperand(5)); // cpol 3590 3591 MachineMemOperand *LoadMMO = *MI.memoperands_begin(); 3592 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); 3593 LoadPtrI.Offset = MI.getOperand(4).getImm(); 3594 MachinePointerInfo StorePtrI = LoadPtrI; 3595 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS; 3596 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; 3597 auto F = LoadMMO->getFlags() & 3598 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); 3599 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, 3600 Size, LoadMMO->getBaseAlign()); 3601 MachineMemOperand *StoreMMO = 3602 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, 3603 sizeof(int32_t), Align(4)); 3604 3605 MIB.setMemRefs({LoadMMO, StoreMMO}); 3606 3607 MI.eraseFromParent(); 3608 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3609 } 3610 3611 bool 
AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic( 3612 MachineInstr &MI) const { 3613 unsigned OpcodeOpIdx = 3614 MI.getOpcode() == AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY ? 1 : 3; 3615 MI.setDesc(TII.get(MI.getOperand(OpcodeOpIdx).getImm())); 3616 MI.removeOperand(OpcodeOpIdx); 3617 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3618 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 3619 } 3620 3621 // FIXME: This should be removed and let the patterns select. We just need the 3622 // AGPR/VGPR combination versions. 3623 bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { 3624 unsigned Opc; 3625 switch (cast<GIntrinsic>(MI).getIntrinsicID()) { 3626 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16: 3627 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64; 3628 break; 3629 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16: 3630 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64; 3631 break; 3632 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16: 3633 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64; 3634 break; 3635 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16: 3636 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64; 3637 break; 3638 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8: 3639 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64; 3640 break; 3641 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: 3642 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64; 3643 break; 3644 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8: 3645 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64; 3646 break; 3647 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8: 3648 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64; 3649 break; 3650 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8: 3651 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64; 3652 break; 3653 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8: 3654 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64; 3655 break; 3656 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8: 3657 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64; 3658 break; 3659 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8: 3660 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64; 3661 break; 3662 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8: 3663 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64; 3664 break; 3665 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: 3666 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64; 3667 break; 3668 case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: 3669 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64; 3670 break; 3671 case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16: 3672 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64; 3673 break; 3674 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16: 3675 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64; 3676 break; 3677 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16: 3678 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64; 3679 break; 3680 case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8: 3681 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64; 3682 break; 3683 case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8: 3684 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64; 3685 break; 3686 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: 3687 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64; 3688 break; 3689 case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: 3690 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64; 3691 break; 3692 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: 3693 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64; 3694 break; 3695 case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: 3696 Opc = 
AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64; 3697 break; 3698 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8: 3699 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64; 3700 break; 3701 case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8: 3702 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64; 3703 break; 3704 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8: 3705 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64; 3706 break; 3707 case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: 3708 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64; 3709 break; 3710 default: 3711 llvm_unreachable("unhandled smfmac intrinsic"); 3712 } 3713 3714 auto VDst_In = MI.getOperand(4); 3715 3716 MI.setDesc(TII.get(Opc)); 3717 MI.removeOperand(4); // VDst_In 3718 MI.removeOperand(1); // Intrinsic ID 3719 MI.addOperand(VDst_In); // Readd VDst_In to the end 3720 MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); 3721 return true; 3722 } 3723 3724 bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin( 3725 MachineInstr &MI, Intrinsic::ID IntrID) const { 3726 if (IntrID == Intrinsic::amdgcn_permlane16_swap && 3727 !Subtarget->hasPermlane16Swap()) 3728 return false; 3729 if (IntrID == Intrinsic::amdgcn_permlane32_swap && 3730 !Subtarget->hasPermlane32Swap()) 3731 return false; 3732 3733 unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap 3734 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64 3735 : AMDGPU::V_PERMLANE32_SWAP_B32_e64; 3736 3737 MI.removeOperand(2); 3738 MI.setDesc(TII.get(Opcode)); 3739 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 3740 3741 MachineOperand &FI = MI.getOperand(4); 3742 FI.setImm(FI.getImm() ? AMDGPU::DPP::DPP_FI_1 : AMDGPU::DPP::DPP_FI_0); 3743 3744 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); 3745 } 3746 3747 bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const { 3748 Register DstReg = MI.getOperand(0).getReg(); 3749 Register SrcReg = MI.getOperand(1).getReg(); 3750 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3751 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3752 MachineBasicBlock *MBB = MI.getParent(); 3753 const DebugLoc &DL = MI.getDebugLoc(); 3754 3755 if (IsVALU) { 3756 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg) 3757 .addImm(Subtarget->getWavefrontSizeLog2()) 3758 .addReg(SrcReg); 3759 } else { 3760 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg) 3761 .addReg(SrcReg) 3762 .addImm(Subtarget->getWavefrontSizeLog2()) 3763 .setOperandDead(3); // Dead scc 3764 } 3765 3766 const TargetRegisterClass &RC = 3767 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass; 3768 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI)) 3769 return false; 3770 3771 MI.eraseFromParent(); 3772 return true; 3773 } 3774 3775 // Match BITOP3 operation and return a number of matched instructions plus 3776 // truth table. 
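// Illustrative sketch of the encoding (not an exhaustive specification):
// each distinct source register is assigned one of the per-input patterns
// 0xf0, 0xcc or 0xaa in the order it is recorded in Src, and the returned
// truth table combines those patterns with the matched bitwise operators.
// For instance, if a, b and c end up mapped to 0xf0, 0xcc and 0xaa
// respectively, then (a & b) | c yields TTbl = (0xf0 & 0xcc) | 0xaa = 0xea
// with NumOpcodes == 2.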
3777 static std::pair<unsigned, uint8_t> BitOp3_Op(Register R, 3778 SmallVectorImpl<Register> &Src, 3779 const MachineRegisterInfo &MRI) { 3780 unsigned NumOpcodes = 0; 3781 uint8_t LHSBits, RHSBits; 3782 3783 auto getOperandBits = [&Src, R, &MRI](Register Op, uint8_t &Bits) -> bool { 3784 // Define truth table given Src0, Src1, Src2 bits permutations: 3785 // 0 0 0 3786 // 0 0 1 3787 // 0 1 0 3788 // 0 1 1 3789 // 1 0 0 3790 // 1 0 1 3791 // 1 1 0 3792 // 1 1 1 3793 const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa }; 3794 3795 if (mi_match(Op, MRI, m_AllOnesInt())) { 3796 Bits = 0xff; 3797 return true; 3798 } 3799 if (mi_match(Op, MRI, m_ZeroInt())) { 3800 Bits = 0; 3801 return true; 3802 } 3803 3804 for (unsigned I = 0; I < Src.size(); ++I) { 3805 // Try to find existing reused operand 3806 if (Src[I] == Op) { 3807 Bits = SrcBits[I]; 3808 return true; 3809 } 3810 // Try to replace parent operator 3811 if (Src[I] == R) { 3812 Bits = SrcBits[I]; 3813 Src[I] = Op; 3814 return true; 3815 } 3816 } 3817 3818 if (Src.size() == 3) { 3819 // No room left for operands. Try one last time, there can be a 'not' of 3820 // one of our source operands. In this case we can compute the bits 3821 // without growing Src vector. 3822 Register LHS; 3823 if (mi_match(Op, MRI, m_Not(m_Reg(LHS)))) { 3824 LHS = getSrcRegIgnoringCopies(LHS, MRI); 3825 for (unsigned I = 0; I < Src.size(); ++I) { 3826 if (Src[I] == LHS) { 3827 Bits = ~SrcBits[I]; 3828 return true; 3829 } 3830 } 3831 } 3832 3833 return false; 3834 } 3835 3836 Bits = SrcBits[Src.size()]; 3837 Src.push_back(Op); 3838 return true; 3839 }; 3840 3841 MachineInstr *MI = MRI.getVRegDef(R); 3842 switch (MI->getOpcode()) { 3843 case TargetOpcode::G_AND: 3844 case TargetOpcode::G_OR: 3845 case TargetOpcode::G_XOR: { 3846 Register LHS = getSrcRegIgnoringCopies(MI->getOperand(1).getReg(), MRI); 3847 Register RHS = getSrcRegIgnoringCopies(MI->getOperand(2).getReg(), MRI); 3848 3849 SmallVector<Register, 3> Backup(Src.begin(), Src.end()); 3850 if (!getOperandBits(LHS, LHSBits) || 3851 !getOperandBits(RHS, RHSBits)) { 3852 Src = Backup; 3853 return std::make_pair(0, 0); 3854 } 3855 3856 // Recursion is naturally limited by the size of the operand vector. 3857 auto Op = BitOp3_Op(LHS, Src, MRI); 3858 if (Op.first) { 3859 NumOpcodes += Op.first; 3860 LHSBits = Op.second; 3861 } 3862 3863 Op = BitOp3_Op(RHS, Src, MRI); 3864 if (Op.first) { 3865 NumOpcodes += Op.first; 3866 RHSBits = Op.second; 3867 } 3868 break; 3869 } 3870 default: 3871 return std::make_pair(0, 0); 3872 } 3873 3874 uint8_t TTbl; 3875 switch (MI->getOpcode()) { 3876 case TargetOpcode::G_AND: 3877 TTbl = LHSBits & RHSBits; 3878 break; 3879 case TargetOpcode::G_OR: 3880 TTbl = LHSBits | RHSBits; 3881 break; 3882 case TargetOpcode::G_XOR: 3883 TTbl = LHSBits ^ RHSBits; 3884 break; 3885 default: 3886 break; 3887 } 3888 3889 return std::make_pair(NumOpcodes + 1, TTbl); 3890 } 3891 3892 bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const { 3893 if (!Subtarget->hasBitOp3Insts()) 3894 return false; 3895 3896 Register DstReg = MI.getOperand(0).getReg(); 3897 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); 3898 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID; 3899 if (!IsVALU) 3900 return false; 3901 3902 SmallVector<Register, 3> Src; 3903 uint8_t TTbl; 3904 unsigned NumOpcodes; 3905 3906 std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI); 3907 3908 // Src.empty() case can happen if all operands are all zero or all ones. 
3909 // Normally it shall be optimized out before reaching this. 3910 if (NumOpcodes < 2 || Src.empty()) 3911 return false; 3912 3913 const bool IsB32 = MRI->getType(DstReg) == LLT::scalar(32); 3914 if (NumOpcodes == 2 && IsB32) { 3915 // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes 3916 // asm more readable. This cannot be modeled with AddedComplexity because 3917 // selector does not know how many operations did we match. 3918 if (mi_match(MI, *MRI, m_GXor(m_GXor(m_Reg(), m_Reg()), m_Reg())) || 3919 mi_match(MI, *MRI, m_GOr(m_GOr(m_Reg(), m_Reg()), m_Reg())) || 3920 mi_match(MI, *MRI, m_GOr(m_GAnd(m_Reg(), m_Reg()), m_Reg()))) 3921 return false; 3922 } else if (NumOpcodes < 4) { 3923 // For a uniform case threshold should be higher to account for moves 3924 // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be 3925 // in SGPRs and a readtfirstlane after. 3926 return false; 3927 } 3928 3929 unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64; 3930 unsigned CBL = STI.getConstantBusLimit(Opc); 3931 MachineBasicBlock *MBB = MI.getParent(); 3932 const DebugLoc &DL = MI.getDebugLoc(); 3933 3934 for (unsigned I = 0; I < Src.size(); ++I) { 3935 const RegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI); 3936 if (RB->getID() != AMDGPU::SGPRRegBankID) 3937 continue; 3938 if (CBL > 0) { 3939 --CBL; 3940 continue; 3941 } 3942 Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 3943 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), NewReg) 3944 .addReg(Src[I]); 3945 Src[I] = NewReg; 3946 } 3947 3948 // Last operand can be ignored, turning a ternary operation into a binary. 3949 // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace 3950 // 'c' with 'a' here without changing the answer. In some pathological 3951 // cases it should be possible to get an operation with a single operand 3952 // too if optimizer would not catch it. 
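  // Reasoning sketch for the padding below: a truth table computed from
  // fewer than three sources never depends on the unused input columns, so
  // duplicating Src[0] into the remaining operand slots cannot change the
  // result of the emitted BITOP3.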
3953 while (Src.size() < 3) 3954 Src.push_back(Src[0]); 3955 3956 auto MIB = BuildMI(*MBB, MI, DL, TII.get(Opc), DstReg); 3957 if (!IsB32) 3958 MIB.addImm(0); // src_mod0 3959 MIB.addReg(Src[0]); 3960 if (!IsB32) 3961 MIB.addImm(0); // src_mod1 3962 MIB.addReg(Src[1]); 3963 if (!IsB32) 3964 MIB.addImm(0); // src_mod2 3965 MIB.addReg(Src[2]) 3966 .addImm(TTbl); 3967 if (!IsB32) 3968 MIB.addImm(0); // op_sel 3969 3970 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); 3971 MI.eraseFromParent(); 3972 3973 return true; 3974 } 3975 3976 bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { 3977 Register SrcReg = MI.getOperand(0).getReg(); 3978 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI)) 3979 return false; 3980 3981 MachineInstr *DefMI = MRI->getVRegDef(SrcReg); 3982 Register SP = 3983 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore(); 3984 Register WaveAddr = getWaveAddress(DefMI); 3985 MachineBasicBlock *MBB = MI.getParent(); 3986 const DebugLoc &DL = MI.getDebugLoc(); 3987 3988 if (!WaveAddr) { 3989 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 3990 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr) 3991 .addReg(SrcReg) 3992 .addImm(Subtarget->getWavefrontSizeLog2()) 3993 .setOperandDead(3); // Dead scc 3994 } 3995 3996 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP) 3997 .addReg(WaveAddr); 3998 3999 MI.eraseFromParent(); 4000 return true; 4001 } 4002 4003 bool AMDGPUInstructionSelector::select(MachineInstr &I) { 4004 4005 if (!I.isPreISelOpcode()) { 4006 if (I.isCopy()) 4007 return selectCOPY(I); 4008 return true; 4009 } 4010 4011 switch (I.getOpcode()) { 4012 case TargetOpcode::G_AND: 4013 case TargetOpcode::G_OR: 4014 case TargetOpcode::G_XOR: 4015 if (selectBITOP3(I)) 4016 return true; 4017 if (selectImpl(I, *CoverageInfo)) 4018 return true; 4019 return selectG_AND_OR_XOR(I); 4020 case TargetOpcode::G_ADD: 4021 case TargetOpcode::G_SUB: 4022 case TargetOpcode::G_PTR_ADD: 4023 if (selectImpl(I, *CoverageInfo)) 4024 return true; 4025 return selectG_ADD_SUB(I); 4026 case TargetOpcode::G_UADDO: 4027 case TargetOpcode::G_USUBO: 4028 case TargetOpcode::G_UADDE: 4029 case TargetOpcode::G_USUBE: 4030 return selectG_UADDO_USUBO_UADDE_USUBE(I); 4031 case AMDGPU::G_AMDGPU_MAD_U64_U32: 4032 case AMDGPU::G_AMDGPU_MAD_I64_I32: 4033 return selectG_AMDGPU_MAD_64_32(I); 4034 case TargetOpcode::G_INTTOPTR: 4035 case TargetOpcode::G_BITCAST: 4036 case TargetOpcode::G_PTRTOINT: 4037 case TargetOpcode::G_FREEZE: 4038 return selectCOPY(I); 4039 case TargetOpcode::G_FNEG: 4040 if (selectImpl(I, *CoverageInfo)) 4041 return true; 4042 return selectG_FNEG(I); 4043 case TargetOpcode::G_FABS: 4044 if (selectImpl(I, *CoverageInfo)) 4045 return true; 4046 return selectG_FABS(I); 4047 case TargetOpcode::G_EXTRACT: 4048 return selectG_EXTRACT(I); 4049 case TargetOpcode::G_MERGE_VALUES: 4050 case TargetOpcode::G_CONCAT_VECTORS: 4051 return selectG_MERGE_VALUES(I); 4052 case TargetOpcode::G_UNMERGE_VALUES: 4053 return selectG_UNMERGE_VALUES(I); 4054 case TargetOpcode::G_BUILD_VECTOR: 4055 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 4056 return selectG_BUILD_VECTOR(I); 4057 case TargetOpcode::G_IMPLICIT_DEF: 4058 return selectG_IMPLICIT_DEF(I); 4059 case TargetOpcode::G_INSERT: 4060 return selectG_INSERT(I); 4061 case TargetOpcode::G_INTRINSIC: 4062 case TargetOpcode::G_INTRINSIC_CONVERGENT: 4063 return selectG_INTRINSIC(I); 4064 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: 4065 case 
TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: 4066 return selectG_INTRINSIC_W_SIDE_EFFECTS(I); 4067 case TargetOpcode::G_ICMP: 4068 case TargetOpcode::G_FCMP: 4069 if (selectG_ICMP_or_FCMP(I)) 4070 return true; 4071 return selectImpl(I, *CoverageInfo); 4072 case TargetOpcode::G_LOAD: 4073 case TargetOpcode::G_ZEXTLOAD: 4074 case TargetOpcode::G_SEXTLOAD: 4075 case TargetOpcode::G_STORE: 4076 case TargetOpcode::G_ATOMIC_CMPXCHG: 4077 case TargetOpcode::G_ATOMICRMW_XCHG: 4078 case TargetOpcode::G_ATOMICRMW_ADD: 4079 case TargetOpcode::G_ATOMICRMW_SUB: 4080 case TargetOpcode::G_ATOMICRMW_AND: 4081 case TargetOpcode::G_ATOMICRMW_OR: 4082 case TargetOpcode::G_ATOMICRMW_XOR: 4083 case TargetOpcode::G_ATOMICRMW_MIN: 4084 case TargetOpcode::G_ATOMICRMW_MAX: 4085 case TargetOpcode::G_ATOMICRMW_UMIN: 4086 case TargetOpcode::G_ATOMICRMW_UMAX: 4087 case TargetOpcode::G_ATOMICRMW_UINC_WRAP: 4088 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: 4089 case TargetOpcode::G_ATOMICRMW_FADD: 4090 case TargetOpcode::G_ATOMICRMW_FMIN: 4091 case TargetOpcode::G_ATOMICRMW_FMAX: 4092 return selectG_LOAD_STORE_ATOMICRMW(I); 4093 case TargetOpcode::G_SELECT: 4094 return selectG_SELECT(I); 4095 case TargetOpcode::G_TRUNC: 4096 return selectG_TRUNC(I); 4097 case TargetOpcode::G_SEXT: 4098 case TargetOpcode::G_ZEXT: 4099 case TargetOpcode::G_ANYEXT: 4100 case TargetOpcode::G_SEXT_INREG: 4101 // This is a workaround. For extension from type i1, `selectImpl()` uses 4102 // patterns from TD file and generates an illegal VGPR to SGPR COPY as type 4103 // i1 can only be hold in a SGPR class. 4104 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) && 4105 selectImpl(I, *CoverageInfo)) 4106 return true; 4107 return selectG_SZA_EXT(I); 4108 case TargetOpcode::G_FPEXT: 4109 if (selectG_FPEXT(I)) 4110 return true; 4111 return selectImpl(I, *CoverageInfo); 4112 case TargetOpcode::G_BRCOND: 4113 return selectG_BRCOND(I); 4114 case TargetOpcode::G_GLOBAL_VALUE: 4115 return selectG_GLOBAL_VALUE(I); 4116 case TargetOpcode::G_PTRMASK: 4117 return selectG_PTRMASK(I); 4118 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 4119 return selectG_EXTRACT_VECTOR_ELT(I); 4120 case TargetOpcode::G_INSERT_VECTOR_ELT: 4121 return selectG_INSERT_VECTOR_ELT(I); 4122 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4123 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16: 4124 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET: 4125 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: 4126 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: { 4127 const AMDGPU::ImageDimIntrinsicInfo *Intr = 4128 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I)); 4129 assert(Intr && "not an image intrinsic with image pseudo"); 4130 return selectImageIntrinsic(I, Intr); 4131 } 4132 case AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY: 4133 case AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY: 4134 case AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY: 4135 return selectBVHIntersectRayIntrinsic(I); 4136 case AMDGPU::G_SBFX: 4137 case AMDGPU::G_UBFX: 4138 return selectG_SBFX_UBFX(I); 4139 case AMDGPU::G_SI_CALL: 4140 I.setDesc(TII.get(AMDGPU::SI_CALL)); 4141 return true; 4142 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: 4143 return selectWaveAddress(I); 4144 case AMDGPU::G_STACKRESTORE: 4145 return selectStackRestore(I); 4146 case AMDGPU::G_PHI: 4147 return selectPHI(I); 4148 case AMDGPU::G_AMDGPU_COPY_SCC_VCC: 4149 return selectCOPY_SCC_VCC(I); 4150 case AMDGPU::G_AMDGPU_COPY_VCC_SCC: 4151 return selectCOPY_VCC_SCC(I); 4152 case AMDGPU::G_AMDGPU_READANYLANE: 4153 return selectReadAnyLane(I); 4154 case TargetOpcode::G_CONSTANT: 4155 case 
TargetOpcode::G_FCONSTANT: 4156 default: 4157 return selectImpl(I, *CoverageInfo); 4158 } 4159 return false; 4160 } 4161 4162 InstructionSelector::ComplexRendererFns 4163 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { 4164 return {{ 4165 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 4166 }}; 4167 4168 } 4169 4170 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( 4171 Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const { 4172 unsigned Mods = 0; 4173 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); 4174 4175 if (MI->getOpcode() == AMDGPU::G_FNEG) { 4176 Src = MI->getOperand(1).getReg(); 4177 Mods |= SISrcMods::NEG; 4178 MI = getDefIgnoringCopies(Src, *MRI); 4179 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { 4180 // Fold fsub [+-]0 into fneg. This may not have folded depending on the 4181 // denormal mode, but we're implicitly canonicalizing in a source operand. 4182 const ConstantFP *LHS = 4183 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI); 4184 if (LHS && LHS->isZero()) { 4185 Mods |= SISrcMods::NEG; 4186 Src = MI->getOperand(2).getReg(); 4187 } 4188 } 4189 4190 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { 4191 Src = MI->getOperand(1).getReg(); 4192 Mods |= SISrcMods::ABS; 4193 } 4194 4195 if (OpSel) 4196 Mods |= SISrcMods::OP_SEL_0; 4197 4198 return std::pair(Src, Mods); 4199 } 4200 4201 Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( 4202 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, 4203 bool ForceVGPR) const { 4204 if ((Mods != 0 || ForceVGPR) && 4205 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) { 4206 4207 // If we looked through copies to find source modifiers on an SGPR operand, 4208 // we now have an SGPR register source. To avoid potentially violating the 4209 // constant bus restriction, we need to insert a copy to a VGPR. 4210 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg()); 4211 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(), 4212 TII.get(AMDGPU::COPY), VGPRSrc) 4213 .addReg(Src); 4214 Src = VGPRSrc; 4215 } 4216 4217 return Src; 4218 } 4219 4220 /// 4221 /// This will select either an SGPR or VGPR operand and will save us from 4222 /// having to write an extra tablegen pattern. 
4223 InstructionSelector::ComplexRendererFns 4224 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { 4225 return {{ 4226 [=](MachineInstrBuilder &MIB) { MIB.add(Root); } 4227 }}; 4228 } 4229 4230 InstructionSelector::ComplexRendererFns 4231 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { 4232 Register Src; 4233 unsigned Mods; 4234 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 4235 4236 return {{ 4237 [=](MachineInstrBuilder &MIB) { 4238 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4239 }, 4240 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4241 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4242 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4243 }}; 4244 } 4245 4246 InstructionSelector::ComplexRendererFns 4247 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { 4248 Register Src; 4249 unsigned Mods; 4250 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 4251 /*IsCanonicalizing=*/true, 4252 /*AllowAbs=*/false); 4253 4254 return {{ 4255 [=](MachineInstrBuilder &MIB) { 4256 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4257 }, 4258 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 4259 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4260 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4261 }}; 4262 } 4263 4264 InstructionSelector::ComplexRendererFns 4265 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { 4266 return {{ 4267 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, 4268 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp 4269 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod 4270 }}; 4271 } 4272 4273 InstructionSelector::ComplexRendererFns 4274 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { 4275 Register Src; 4276 unsigned Mods; 4277 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 4278 4279 return {{ 4280 [=](MachineInstrBuilder &MIB) { 4281 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4282 }, 4283 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4284 }}; 4285 } 4286 4287 InstructionSelector::ComplexRendererFns 4288 AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( 4289 MachineOperand &Root) const { 4290 Register Src; 4291 unsigned Mods; 4292 std::tie(Src, Mods) = 4293 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false); 4294 4295 return {{ 4296 [=](MachineInstrBuilder &MIB) { 4297 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4298 }, 4299 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4300 }}; 4301 } 4302 4303 InstructionSelector::ComplexRendererFns 4304 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { 4305 Register Src; 4306 unsigned Mods; 4307 std::tie(Src, Mods) = 4308 selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true, 4309 /*AllowAbs=*/false); 4310 4311 return {{ 4312 [=](MachineInstrBuilder &MIB) { 4313 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); 4314 }, 4315 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4316 }}; 4317 } 4318 4319 InstructionSelector::ComplexRendererFns 4320 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { 4321 Register Reg = Root.getReg(); 4322 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); 4323 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS) 4324 return {}; 4325 return {{ 4326 [=](MachineInstrBuilder &MIB) { 
MIB.addReg(Reg); }, 4327 }}; 4328 } 4329 4330 enum class SrcStatus { 4331 IS_SAME, 4332 IS_UPPER_HALF, 4333 IS_LOWER_HALF, 4334 IS_UPPER_HALF_NEG, 4335 // This means current op = [op_upper, op_lower] and src = -op_lower. 4336 IS_LOWER_HALF_NEG, 4337 IS_HI_NEG, 4338 // This means current op = [op_upper, op_lower] and src = [op_upper, 4339 // -op_lower]. 4340 IS_LO_NEG, 4341 IS_BOTH_NEG, 4342 INVALID, 4343 NEG_START = IS_UPPER_HALF_NEG, 4344 NEG_END = IS_BOTH_NEG, 4345 HALF_START = IS_UPPER_HALF, 4346 HALF_END = IS_LOWER_HALF_NEG 4347 }; 4348 /// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n` 4349 static bool isTruncHalf(const MachineInstr *MI, 4350 const MachineRegisterInfo &MRI) { 4351 if (MI->getOpcode() != AMDGPU::G_TRUNC) 4352 return false; 4353 4354 unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); 4355 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); 4356 return DstSize * 2 == SrcSize; 4357 } 4358 4359 /// Test if the MI is logic shift right with half bits, 4360 /// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)` 4361 static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { 4362 if (MI->getOpcode() != AMDGPU::G_LSHR) 4363 return false; 4364 4365 Register ShiftSrc; 4366 std::optional<ValueAndVReg> ShiftAmt; 4367 if (mi_match(MI->getOperand(0).getReg(), MRI, 4368 m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { 4369 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); 4370 unsigned Shift = ShiftAmt->Value.getZExtValue(); 4371 return Shift * 2 == SrcSize; 4372 } 4373 return false; 4374 } 4375 4376 /// Test if the MI is shift left with half bits, 4377 /// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)` 4378 static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { 4379 if (MI->getOpcode() != AMDGPU::G_SHL) 4380 return false; 4381 4382 Register ShiftSrc; 4383 std::optional<ValueAndVReg> ShiftAmt; 4384 if (mi_match(MI->getOperand(0).getReg(), MRI, 4385 m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { 4386 unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); 4387 unsigned Shift = ShiftAmt->Value.getZExtValue(); 4388 return Shift * 2 == SrcSize; 4389 } 4390 return false; 4391 } 4392 4393 /// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n` 4394 static bool isUnmergeHalf(const MachineInstr *MI, 4395 const MachineRegisterInfo &MRI) { 4396 if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES) 4397 return false; 4398 return MI->getNumOperands() == 3 && MI->getOperand(0).isDef() && 4399 MI->getOperand(1).isDef() && !MI->getOperand(2).isDef(); 4400 } 4401 4402 enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED }; 4403 4404 static TypeClass isVectorOfTwoOrScalar(Register Reg, 4405 const MachineRegisterInfo &MRI) { 4406 LLT OpTy = MRI.getType(Reg); 4407 if (OpTy.isScalar()) 4408 return TypeClass::SCALAR; 4409 if (OpTy.isVector() && OpTy.getNumElements() == 2) 4410 return TypeClass::VECTOR_OF_TWO; 4411 return TypeClass::NONE_OF_LISTED; 4412 } 4413 4414 static SrcStatus getNegStatus(Register Reg, SrcStatus S, 4415 const MachineRegisterInfo &MRI) { 4416 TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI); 4417 if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR) 4418 return SrcStatus::INVALID; 4419 4420 switch (S) { 4421 case SrcStatus::IS_SAME: 4422 if (NegType == TypeClass::VECTOR_OF_TWO) { 4423 // Vector of 2: 4424 // [SrcHi, SrcLo] = [CurrHi, CurrLo] 4425 // [CurrHi, CurrLo] = neg [OpHi, 
OpLo](2 x Type) 4426 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) 4427 // [SrcHi, SrcLo] = [-OpHi, -OpLo] 4428 return SrcStatus::IS_BOTH_NEG; 4429 } 4430 if (NegType == TypeClass::SCALAR) { 4431 // Scalar: 4432 // [SrcHi, SrcLo] = [CurrHi, CurrLo] 4433 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) 4434 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) 4435 // [SrcHi, SrcLo] = [-OpHi, OpLo] 4436 return SrcStatus::IS_HI_NEG; 4437 } 4438 break; 4439 case SrcStatus::IS_HI_NEG: 4440 if (NegType == TypeClass::VECTOR_OF_TWO) { 4441 // Vector of 2: 4442 // [SrcHi, SrcLo] = [-CurrHi, CurrLo] 4443 // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) 4444 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) 4445 // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo] 4446 return SrcStatus::IS_LO_NEG; 4447 } 4448 if (NegType == TypeClass::SCALAR) { 4449 // Scalar: 4450 // [SrcHi, SrcLo] = [-CurrHi, CurrLo] 4451 // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) 4452 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) 4453 // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo] 4454 return SrcStatus::IS_SAME; 4455 } 4456 break; 4457 case SrcStatus::IS_LO_NEG: 4458 if (NegType == TypeClass::VECTOR_OF_TWO) { 4459 // Vector of 2: 4460 // [SrcHi, SrcLo] = [CurrHi, -CurrLo] 4461 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) 4462 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) 4463 // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo] 4464 return SrcStatus::IS_HI_NEG; 4465 } 4466 if (NegType == TypeClass::SCALAR) { 4467 // Scalar: 4468 // [SrcHi, SrcLo] = [CurrHi, -CurrLo] 4469 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) 4470 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) 4471 // [SrcHi, SrcLo] = [-OpHi, -OpLo] 4472 return SrcStatus::IS_BOTH_NEG; 4473 } 4474 break; 4475 case SrcStatus::IS_BOTH_NEG: 4476 if (NegType == TypeClass::VECTOR_OF_TWO) { 4477 // Vector of 2: 4478 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] 4479 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) 4480 // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) 4481 // [SrcHi, SrcLo] = [OpHi, OpLo] 4482 return SrcStatus::IS_SAME; 4483 } 4484 if (NegType == TypeClass::SCALAR) { 4485 // Scalar: 4486 // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] 4487 // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) 4488 // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) 4489 // [SrcHi, SrcLo] = [OpHi, -OpLo] 4490 return SrcStatus::IS_LO_NEG; 4491 } 4492 break; 4493 case SrcStatus::IS_UPPER_HALF: 4494 // Vector of 2: 4495 // Src = CurrUpper 4496 // Curr = [CurrUpper, CurrLower] 4497 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) 4498 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) 4499 // Src = -OpUpper 4500 // 4501 // Scalar: 4502 // Src = CurrUpper 4503 // Curr = [CurrUpper, CurrLower] 4504 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) 4505 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) 4506 // Src = -OpUpper 4507 return SrcStatus::IS_UPPER_HALF_NEG; 4508 case SrcStatus::IS_LOWER_HALF: 4509 if (NegType == TypeClass::VECTOR_OF_TWO) { 4510 // Vector of 2: 4511 // Src = CurrLower 4512 // Curr = [CurrUpper, CurrLower] 4513 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) 4514 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) 4515 // Src = -OpLower 4516 return SrcStatus::IS_LOWER_HALF_NEG; 4517 } 4518 if (NegType == TypeClass::SCALAR) { 4519 // Scalar: 4520 // Src = CurrLower 4521 // Curr = [CurrUpper, CurrLower] 4522 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) 4523 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) 4524 // Src = OpLower 4525 return 
SrcStatus::IS_LOWER_HALF; 4526 } 4527 break; 4528 case SrcStatus::IS_UPPER_HALF_NEG: 4529 // Vector of 2: 4530 // Src = -CurrUpper 4531 // Curr = [CurrUpper, CurrLower] 4532 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) 4533 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) 4534 // Src = -(-OpUpper) = OpUpper 4535 // 4536 // Scalar: 4537 // Src = -CurrUpper 4538 // Curr = [CurrUpper, CurrLower] 4539 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) 4540 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) 4541 // Src = -(-OpUpper) = OpUpper 4542 return SrcStatus::IS_UPPER_HALF; 4543 case SrcStatus::IS_LOWER_HALF_NEG: 4544 if (NegType == TypeClass::VECTOR_OF_TWO) { 4545 // Vector of 2: 4546 // Src = -CurrLower 4547 // Curr = [CurrUpper, CurrLower] 4548 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) 4549 // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) 4550 // Src = -(-OpLower) = OpLower 4551 return SrcStatus::IS_LOWER_HALF; 4552 } 4553 if (NegType == TypeClass::SCALAR) { 4554 // Scalar: 4555 // Src = -CurrLower 4556 // Curr = [CurrUpper, CurrLower] 4557 // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) 4558 // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) 4559 // Src = -OpLower 4560 return SrcStatus::IS_LOWER_HALF_NEG; 4561 } 4562 break; 4563 default: 4564 break; 4565 } 4566 llvm_unreachable("unexpected SrcStatus & NegType combination"); 4567 } 4568 4569 static std::optional<std::pair<Register, SrcStatus>> 4570 calcNextStatus(std::pair<Register, SrcStatus> Curr, 4571 const MachineRegisterInfo &MRI) { 4572 const MachineInstr *MI = MRI.getVRegDef(Curr.first); 4573 4574 unsigned Opc = MI->getOpcode(); 4575 4576 // Handle general Opc cases. 4577 switch (Opc) { 4578 case AMDGPU::G_BITCAST: 4579 return std::optional<std::pair<Register, SrcStatus>>( 4580 {MI->getOperand(1).getReg(), Curr.second}); 4581 case AMDGPU::COPY: 4582 if (MI->getOperand(1).getReg().isPhysical()) 4583 return std::nullopt; 4584 return std::optional<std::pair<Register, SrcStatus>>( 4585 {MI->getOperand(1).getReg(), Curr.second}); 4586 case AMDGPU::G_FNEG: { 4587 SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI); 4588 if (Stat == SrcStatus::INVALID) 4589 return std::nullopt; 4590 return std::optional<std::pair<Register, SrcStatus>>( 4591 {MI->getOperand(1).getReg(), Stat}); 4592 } 4593 default: 4594 break; 4595 } 4596 4597 // Calc next Stat from current Stat. 
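  // Illustrative reading of the switch below (no extra case): when tracking
  // the lower half of %t in
  //   %c:_(s32) = G_CONSTANT i32 16
  //   %t:_(s32) = G_LSHR %x:_(s32), %c
  // the walk continues into %x as IS_UPPER_HALF, since the low 16 bits of
  // the shift result are the high 16 bits of %x; the *_NEG variants follow
  // the same pattern.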
4598 switch (Curr.second) { 4599 case SrcStatus::IS_SAME: 4600 if (isTruncHalf(MI, MRI)) 4601 return std::optional<std::pair<Register, SrcStatus>>( 4602 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF}); 4603 else if (isUnmergeHalf(MI, MRI)) { 4604 if (Curr.first == MI->getOperand(0).getReg()) 4605 return std::optional<std::pair<Register, SrcStatus>>( 4606 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF}); 4607 return std::optional<std::pair<Register, SrcStatus>>( 4608 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF}); 4609 } 4610 break; 4611 case SrcStatus::IS_HI_NEG: 4612 if (isTruncHalf(MI, MRI)) { 4613 // [SrcHi, SrcLo] = [-CurrHi, CurrLo] 4614 // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower 4615 // = [OpLowerHi, OpLowerLo] 4616 // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo] 4617 // = [-OpLowerHi, OpLowerLo] 4618 // = -OpLower 4619 return std::optional<std::pair<Register, SrcStatus>>( 4620 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); 4621 } 4622 if (isUnmergeHalf(MI, MRI)) { 4623 if (Curr.first == MI->getOperand(0).getReg()) 4624 return std::optional<std::pair<Register, SrcStatus>>( 4625 {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); 4626 return std::optional<std::pair<Register, SrcStatus>>( 4627 {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG}); 4628 } 4629 break; 4630 case SrcStatus::IS_UPPER_HALF: 4631 if (isShlHalf(MI, MRI)) 4632 return std::optional<std::pair<Register, SrcStatus>>( 4633 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF}); 4634 break; 4635 case SrcStatus::IS_LOWER_HALF: 4636 if (isLshrHalf(MI, MRI)) 4637 return std::optional<std::pair<Register, SrcStatus>>( 4638 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF}); 4639 break; 4640 case SrcStatus::IS_UPPER_HALF_NEG: 4641 if (isShlHalf(MI, MRI)) 4642 return std::optional<std::pair<Register, SrcStatus>>( 4643 {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); 4644 break; 4645 case SrcStatus::IS_LOWER_HALF_NEG: 4646 if (isLshrHalf(MI, MRI)) 4647 return std::optional<std::pair<Register, SrcStatus>>( 4648 {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG}); 4649 break; 4650 default: 4651 break; 4652 } 4653 return std::nullopt; 4654 } 4655 4656 /// This is used to control valid status that current MI supports. For example, 4657 /// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG 4658 /// bit on VOP3P. 4659 /// The class can be further extended to recognize support on SEL, NEG, ABS bit 4660 /// for different MI on different arch 4661 class SearchOptions { 4662 private: 4663 bool HasNeg = false; 4664 // Assume all complex pattern of VOP3P have opsel. 4665 bool HasOpsel = true; 4666 4667 public: 4668 SearchOptions(Register Reg, const MachineRegisterInfo &MRI) { 4669 const MachineInstr *MI = MRI.getVRegDef(Reg); 4670 unsigned Opc = MI->getOpcode(); 4671 4672 if (Opc < TargetOpcode::GENERIC_OP_END) { 4673 // Keep same for generic op. 4674 HasNeg = true; 4675 } else if (Opc == TargetOpcode::G_INTRINSIC) { 4676 Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID(); 4677 // Only float point intrinsic has neg & neg_hi bits. 
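      // (Integer dot intrinsics such as @llvm.amdgcn.sdot2 therefore keep
      // HasNeg == false; see the class comment above.)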
4678 if (IntrinsicID == Intrinsic::amdgcn_fdot2) 4679 HasNeg = true; 4680 } 4681 } 4682 bool checkOptions(SrcStatus Stat) const { 4683 if (!HasNeg && 4684 (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) { 4685 return false; 4686 } 4687 if (!HasOpsel && 4688 (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) { 4689 return false; 4690 } 4691 return true; 4692 } 4693 }; 4694 4695 static SmallVector<std::pair<Register, SrcStatus>> 4696 getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, 4697 int MaxDepth = 3) { 4698 int Depth = 0; 4699 auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI); 4700 SmallVector<std::pair<Register, SrcStatus>> Statlist; 4701 4702 while (Depth <= MaxDepth && Curr.has_value()) { 4703 Depth++; 4704 if (SO.checkOptions(Curr.value().second)) 4705 Statlist.push_back(Curr.value()); 4706 Curr = calcNextStatus(Curr.value(), MRI); 4707 } 4708 4709 return Statlist; 4710 } 4711 4712 static std::pair<Register, SrcStatus> 4713 getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, 4714 int MaxDepth = 3) { 4715 int Depth = 0; 4716 std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME}; 4717 auto Curr = calcNextStatus(LastSameOrNeg, MRI); 4718 4719 while (Depth <= MaxDepth && Curr.has_value()) { 4720 Depth++; 4721 SrcStatus Stat = Curr.value().second; 4722 if (SO.checkOptions(Stat)) { 4723 if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG || 4724 Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG) 4725 LastSameOrNeg = Curr.value(); 4726 } 4727 Curr = calcNextStatus(Curr.value(), MRI); 4728 } 4729 4730 return LastSameOrNeg; 4731 } 4732 4733 static bool isSameBitWidth(Register Reg1, Register Reg2, 4734 const MachineRegisterInfo &MRI) { 4735 unsigned Width1 = MRI.getType(Reg1).getSizeInBits(); 4736 unsigned Width2 = MRI.getType(Reg2).getSizeInBits(); 4737 return Width1 == Width2; 4738 } 4739 4740 static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { 4741 // SrcStatus::IS_LOWER_HALF remain 0. 
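  // Worked example, assuming the incoming Mods is 0 (purely illustrative):
  // with HiStat == IS_LOWER_HALF and LoStat == IS_UPPER_HALF, i.e. the
  // packed value is the source register with its halves swapped, the chain
  // below only sets OP_SEL_0 and leaves OP_SEL_1 clear, selecting the upper
  // half for the low lane and the lower half for the high lane.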
4742 if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { 4743 Mods ^= SISrcMods::NEG_HI; 4744 Mods |= SISrcMods::OP_SEL_1; 4745 } else if (HiStat == SrcStatus::IS_UPPER_HALF) 4746 Mods |= SISrcMods::OP_SEL_1; 4747 else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) 4748 Mods ^= SISrcMods::NEG_HI; 4749 else if (HiStat == SrcStatus::IS_HI_NEG) 4750 Mods ^= SISrcMods::NEG_HI; 4751 4752 if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { 4753 Mods ^= SISrcMods::NEG; 4754 Mods |= SISrcMods::OP_SEL_0; 4755 } else if (LoStat == SrcStatus::IS_UPPER_HALF) 4756 Mods |= SISrcMods::OP_SEL_0; 4757 else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) 4758 Mods |= SISrcMods::NEG; 4759 else if (LoStat == SrcStatus::IS_HI_NEG) 4760 Mods ^= SISrcMods::NEG; 4761 4762 return Mods; 4763 } 4764 4765 static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, 4766 Register RootReg, const SIInstrInfo &TII, 4767 const MachineRegisterInfo &MRI) { 4768 auto IsHalfState = [](SrcStatus S) { 4769 return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG || 4770 S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG; 4771 }; 4772 return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) && 4773 IsHalfState(HiStat); 4774 } 4775 4776 std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( 4777 Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const { 4778 unsigned Mods = 0; 4779 // No modification if Root type is not form of <2 x Type>. 4780 if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) { 4781 Mods |= SISrcMods::OP_SEL_1; 4782 return {RootReg, Mods}; 4783 } 4784 4785 SearchOptions SO(RootReg, MRI); 4786 4787 std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO); 4788 4789 if (Stat.second == SrcStatus::IS_BOTH_NEG) 4790 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); 4791 else if (Stat.second == SrcStatus::IS_HI_NEG) 4792 Mods ^= SISrcMods::NEG_HI; 4793 else if (Stat.second == SrcStatus::IS_LO_NEG) 4794 Mods ^= SISrcMods::NEG; 4795 4796 MachineInstr *MI = MRI.getVRegDef(Stat.first); 4797 4798 if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 || 4799 (IsDOT && Subtarget->hasDOTOpSelHazard())) { 4800 Mods |= SISrcMods::OP_SEL_1; 4801 return {Stat.first, Mods}; 4802 } 4803 4804 SmallVector<std::pair<Register, SrcStatus>> StatlistHi = 4805 getSrcStats(MI->getOperand(2).getReg(), MRI, SO); 4806 4807 if (StatlistHi.empty()) { 4808 Mods |= SISrcMods::OP_SEL_1; 4809 return {Stat.first, Mods}; 4810 } 4811 4812 SmallVector<std::pair<Register, SrcStatus>> StatlistLo = 4813 getSrcStats(MI->getOperand(1).getReg(), MRI, SO); 4814 4815 if (StatlistLo.empty()) { 4816 Mods |= SISrcMods::OP_SEL_1; 4817 return {Stat.first, Mods}; 4818 } 4819 4820 for (int I = StatlistHi.size() - 1; I >= 0; I--) { 4821 for (int J = StatlistLo.size() - 1; J >= 0; J--) { 4822 if (StatlistHi[I].first == StatlistLo[J].first && 4823 isValidToPack(StatlistHi[I].second, StatlistLo[J].second, 4824 StatlistHi[I].first, RootReg, TII, MRI)) 4825 return {StatlistHi[I].first, 4826 updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)}; 4827 } 4828 } 4829 // Packed instructions do not have abs modifiers. 4830 Mods |= SISrcMods::OP_SEL_1; 4831 4832 return {Stat.first, Mods}; 4833 } 4834 4835 // Removed unused function `getAllKindImm` to eliminate dead code. 
4836 4837 static bool checkRB(Register Reg, unsigned int RBNo, 4838 const AMDGPURegisterBankInfo &RBI, 4839 const MachineRegisterInfo &MRI, 4840 const TargetRegisterInfo &TRI) { 4841 const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); 4842 return RB->getID() == RBNo; 4843 } 4844 4845 // This function is used to get the correct register bank for returned reg. 4846 // Assume: 4847 // 1. VOP3P is always legal for VGPR. 4848 // 2. RootOp's regbank is legal. 4849 // Thus 4850 // 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR. 4851 // 2. If RootOp is VGPR, then NewOp must be VGPR. 4852 static Register getLegalRegBank(Register NewReg, Register RootReg, 4853 const AMDGPURegisterBankInfo &RBI, 4854 MachineRegisterInfo &MRI, 4855 const TargetRegisterInfo &TRI, 4856 const SIInstrInfo &TII) { 4857 // RootOp can only be VGPR or SGPR (some hand written cases such as. 4858 // inst-select-ashr.v2s16.mir::ashr_v2s16_vs). 4859 if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || 4860 checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) 4861 return NewReg; 4862 4863 MachineInstr *MI = MRI.getVRegDef(RootReg); 4864 if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) { 4865 // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp. 4866 return RootReg; 4867 } 4868 4869 MachineBasicBlock *BB = MI->getParent(); 4870 Register DstReg = MRI.cloneVirtualRegister(RootReg); 4871 4872 MachineInstrBuilder MIB = 4873 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) 4874 .addReg(NewReg); 4875 4876 // Only accept VGPR. 4877 return MIB->getOperand(0).getReg(); 4878 } 4879 4880 InstructionSelector::ComplexRendererFns 4881 AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root, 4882 bool IsDOT) const { 4883 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); 4884 Register Reg; 4885 unsigned Mods; 4886 std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT); 4887 4888 Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII); 4889 return {{ 4890 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 4891 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4892 }}; 4893 } 4894 4895 InstructionSelector::ComplexRendererFns 4896 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { 4897 4898 return selectVOP3PRetHelper(Root); 4899 } 4900 4901 InstructionSelector::ComplexRendererFns 4902 AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { 4903 4904 return selectVOP3PRetHelper(Root, true); 4905 } 4906 4907 InstructionSelector::ComplexRendererFns 4908 AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { 4909 // Literal i1 value set in intrinsic, represents SrcMods for the next operand. 4910 // Value is in Imm operand as i1 sign extended to int64_t. 4911 // 1(-1) promotes packed values to signed, 0 treats them as unsigned. 
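// Thus an immediate of -1 renders OP_SEL_1 | NEG below, while 0 renders just
// the default OP_SEL_1.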
4912 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 4913 "expected i1 value"); 4914 unsigned Mods = SISrcMods::OP_SEL_1; 4915 if (Root.getImm() == -1) 4916 Mods ^= SISrcMods::NEG; 4917 return {{ 4918 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4919 }}; 4920 } 4921 4922 InstructionSelector::ComplexRendererFns 4923 AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( 4924 MachineOperand &Root) const { 4925 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && 4926 "expected i1 value"); 4927 unsigned Mods = SISrcMods::OP_SEL_1; 4928 if (Root.getImm() != 0) 4929 Mods |= SISrcMods::OP_SEL_0; 4930 4931 return {{ 4932 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 4933 }}; 4934 } 4935 4936 static Register buildRegSequence(SmallVectorImpl<Register> &Elts, 4937 MachineInstr *InsertPt, 4938 MachineRegisterInfo &MRI) { 4939 const TargetRegisterClass *DstRegClass; 4940 switch (Elts.size()) { 4941 case 8: 4942 DstRegClass = &AMDGPU::VReg_256RegClass; 4943 break; 4944 case 4: 4945 DstRegClass = &AMDGPU::VReg_128RegClass; 4946 break; 4947 case 2: 4948 DstRegClass = &AMDGPU::VReg_64RegClass; 4949 break; 4950 default: 4951 llvm_unreachable("unhandled Reg sequence size"); 4952 } 4953 4954 MachineIRBuilder B(*InsertPt); 4955 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) 4956 .addDef(MRI.createVirtualRegister(DstRegClass)); 4957 for (unsigned i = 0; i < Elts.size(); ++i) { 4958 MIB.addReg(Elts[i]); 4959 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); 4960 } 4961 return MIB->getOperand(0).getReg(); 4962 } 4963 4964 static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, 4965 SmallVectorImpl<Register> &Elts, Register &Src, 4966 MachineInstr *InsertPt, 4967 MachineRegisterInfo &MRI) { 4968 if (ModOpcode == TargetOpcode::G_FNEG) { 4969 Mods |= SISrcMods::NEG; 4970 // Check if all elements also have abs modifier 4971 SmallVector<Register, 8> NegAbsElts; 4972 for (auto El : Elts) { 4973 Register FabsSrc; 4974 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) 4975 break; 4976 NegAbsElts.push_back(FabsSrc); 4977 } 4978 if (Elts.size() != NegAbsElts.size()) { 4979 // Neg 4980 Src = buildRegSequence(Elts, InsertPt, MRI); 4981 } else { 4982 // Neg and Abs 4983 Mods |= SISrcMods::NEG_HI; 4984 Src = buildRegSequence(NegAbsElts, InsertPt, MRI); 4985 } 4986 } else { 4987 assert(ModOpcode == TargetOpcode::G_FABS); 4988 // Abs 4989 Mods |= SISrcMods::NEG_HI; 4990 Src = buildRegSequence(Elts, InsertPt, MRI); 4991 } 4992 } 4993 4994 InstructionSelector::ComplexRendererFns 4995 AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { 4996 Register Src = Root.getReg(); 4997 unsigned Mods = SISrcMods::OP_SEL_1; 4998 SmallVector<Register, 8> EltsF32; 4999 5000 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) { 5001 assert(BV->getNumSources() > 0); 5002 // Based on first element decide which mod we match, neg or abs 5003 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0)); 5004 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) 5005 ? 
AMDGPU::G_FNEG 5006 : AMDGPU::G_FABS; 5007 for (unsigned i = 0; i < BV->getNumSources(); ++i) { 5008 ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); 5009 if (ElF32->getOpcode() != ModOpcode) 5010 break; 5011 EltsF32.push_back(ElF32->getOperand(1).getReg()); 5012 } 5013 5014 // All elements had ModOpcode modifier 5015 if (BV->getNumSources() == EltsF32.size()) { 5016 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), 5017 *MRI); 5018 } 5019 } 5020 5021 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5022 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 5023 } 5024 5025 InstructionSelector::ComplexRendererFns 5026 AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { 5027 Register Src = Root.getReg(); 5028 unsigned Mods = SISrcMods::OP_SEL_1; 5029 SmallVector<Register, 8> EltsV2F16; 5030 5031 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 5032 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 5033 Register FNegSrc; 5034 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) 5035 break; 5036 EltsV2F16.push_back(FNegSrc); 5037 } 5038 5039 // All elements had ModOpcode modifier 5040 if (CV->getNumSources() == EltsV2F16.size()) { 5041 Mods |= SISrcMods::NEG; 5042 Mods |= SISrcMods::NEG_HI; 5043 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); 5044 } 5045 } 5046 5047 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5048 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 5049 } 5050 5051 InstructionSelector::ComplexRendererFns 5052 AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { 5053 Register Src = Root.getReg(); 5054 unsigned Mods = SISrcMods::OP_SEL_1; 5055 SmallVector<Register, 8> EltsV2F16; 5056 5057 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) { 5058 assert(CV->getNumSources() > 0); 5059 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0)); 5060 // Based on first element decide which mod we match, neg or abs 5061 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) 5062 ? AMDGPU::G_FNEG 5063 : AMDGPU::G_FABS; 5064 5065 for (unsigned i = 0; i < CV->getNumSources(); ++i) { 5066 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); 5067 if (ElV2F16->getOpcode() != ModOpcode) 5068 break; 5069 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); 5070 } 5071 5072 // All elements had ModOpcode modifier 5073 if (CV->getNumSources() == EltsV2F16.size()) { 5074 MachineIRBuilder B(*Root.getParent()); 5075 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), 5076 *MRI); 5077 } 5078 } 5079 5080 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5081 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; 5082 } 5083 5084 InstructionSelector::ComplexRendererFns 5085 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { 5086 std::optional<FPValueAndVReg> FPValReg; 5087 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { 5088 if (TII.isInlineConstant(FPValReg->Value)) { 5089 return {{[=](MachineInstrBuilder &MIB) { 5090 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); 5091 }}}; 5092 } 5093 // Non-inlineable splat floats should not fall-through for integer immediate 5094 // checks. 
5095 return {}; 5096 } 5097 5098 APInt ICst; 5099 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { 5100 if (TII.isInlineConstant(ICst)) { 5101 return { 5102 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; 5103 } 5104 } 5105 5106 return {}; 5107 } 5108 5109 InstructionSelector::ComplexRendererFns 5110 AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { 5111 Register Src = 5112 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 5113 unsigned Key = 0; 5114 5115 Register ShiftSrc; 5116 std::optional<ValueAndVReg> ShiftAmt; 5117 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 5118 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 5119 ShiftAmt->Value.getZExtValue() % 8 == 0) { 5120 Key = ShiftAmt->Value.getZExtValue() / 8; 5121 Src = ShiftSrc; 5122 } 5123 5124 return {{ 5125 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5126 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 5127 }}; 5128 } 5129 5130 InstructionSelector::ComplexRendererFns 5131 AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { 5132 5133 Register Src = 5134 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); 5135 unsigned Key = 0; 5136 5137 Register ShiftSrc; 5138 std::optional<ValueAndVReg> ShiftAmt; 5139 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && 5140 MRI->getType(ShiftSrc).getSizeInBits() == 32 && 5141 ShiftAmt->Value.getZExtValue() == 16) { 5142 Src = ShiftSrc; 5143 Key = 1; 5144 } 5145 5146 return {{ 5147 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5148 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key 5149 }}; 5150 } 5151 5152 InstructionSelector::ComplexRendererFns 5153 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { 5154 Register Src; 5155 unsigned Mods; 5156 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 5157 5158 // FIXME: Handle op_sel 5159 return {{ 5160 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 5161 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 5162 }}; 5163 } 5164 5165 // FIXME-TRUE16 remove when fake16 is removed 5166 InstructionSelector::ComplexRendererFns 5167 AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { 5168 Register Src; 5169 unsigned Mods; 5170 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 5171 /*IsCanonicalizing=*/true, 5172 /*AllowAbs=*/false, 5173 /*OpSel=*/false); 5174 5175 return {{ 5176 [=](MachineInstrBuilder &MIB) { 5177 MIB.addReg( 5178 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 5179 }, 5180 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 5181 }}; 5182 } 5183 5184 InstructionSelector::ComplexRendererFns 5185 AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { 5186 Register Src; 5187 unsigned Mods; 5188 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), 5189 /*IsCanonicalizing=*/true, 5190 /*AllowAbs=*/false, 5191 /*OpSel=*/true); 5192 5193 return {{ 5194 [=](MachineInstrBuilder &MIB) { 5195 MIB.addReg( 5196 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true)); 5197 }, 5198 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods 5199 }}; 5200 } 5201 5202 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root, 5203 Register &Base, 5204 Register *SOffset, 5205 int64_t *Offset) const { 5206 MachineInstr *MI = Root.getParent(); 5207 MachineBasicBlock *MBB = MI->getParent(); 5208 5209 // 
FIXME: We should shrink the GEP if the offset is known to be <= 32-bits, 5210 // then we can select all ptr + 32-bit offsets. 5211 SmallVector<GEPInfo, 4> AddrInfo; 5212 getAddrModeInfo(*MI, *MRI, AddrInfo); 5213 5214 if (AddrInfo.empty()) 5215 return false; 5216 5217 const GEPInfo &GEPI = AddrInfo[0]; 5218 std::optional<int64_t> EncodedImm; 5219 5220 if (SOffset && Offset) { 5221 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, 5222 /*HasSOffset=*/true); 5223 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm && 5224 AddrInfo.size() > 1) { 5225 const GEPInfo &GEPI2 = AddrInfo[1]; 5226 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) { 5227 if (Register OffsetReg = 5228 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) { 5229 Base = GEPI2.SgprParts[0]; 5230 *SOffset = OffsetReg; 5231 *Offset = *EncodedImm; 5232 if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI)) 5233 return true; 5234 5235 // For unbuffered smem loads, it is illegal for the Immediate Offset 5236 // to be negative if the resulting (Offset + (M0 or SOffset or zero) 5237 // is negative. Handle the case where the Immediate Offset + SOffset 5238 // is negative. 5239 auto SKnown = VT->getKnownBits(*SOffset); 5240 if (*Offset + SKnown.getMinValue().getSExtValue() < 0) 5241 return false; 5242 5243 return true; 5244 } 5245 } 5246 } 5247 return false; 5248 } 5249 5250 EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false, 5251 /*HasSOffset=*/false); 5252 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) { 5253 Base = GEPI.SgprParts[0]; 5254 *Offset = *EncodedImm; 5255 return true; 5256 } 5257 5258 // SGPR offset is unsigned. 5259 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) && 5260 GEPI.Imm != 0) { 5261 // If we make it this far we have a load with an 32-bit immediate offset. 5262 // It is OK to select this using a sgpr offset, because we have already 5263 // failed trying to select this load into one of the _IMM variants since 5264 // the _IMM Patterns are considered before the _SGPR patterns. 
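// Materialize the 32-bit immediate into an SGPR and use it as soffset.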
5265 Base = GEPI.SgprParts[0]; 5266 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5267 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset) 5268 .addImm(GEPI.Imm); 5269 return true; 5270 } 5271 5272 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) { 5273 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) { 5274 Base = GEPI.SgprParts[0]; 5275 *SOffset = OffsetReg; 5276 return true; 5277 } 5278 } 5279 5280 return false; 5281 } 5282 5283 InstructionSelector::ComplexRendererFns 5284 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const { 5285 Register Base; 5286 int64_t Offset; 5287 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset)) 5288 return std::nullopt; 5289 5290 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 5291 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 5292 } 5293 5294 InstructionSelector::ComplexRendererFns 5295 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const { 5296 SmallVector<GEPInfo, 4> AddrInfo; 5297 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo); 5298 5299 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1) 5300 return std::nullopt; 5301 5302 const GEPInfo &GEPInfo = AddrInfo[0]; 5303 Register PtrReg = GEPInfo.SgprParts[0]; 5304 std::optional<int64_t> EncodedImm = 5305 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm); 5306 if (!EncodedImm) 5307 return std::nullopt; 5308 5309 return {{ 5310 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); }, 5311 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } 5312 }}; 5313 } 5314 5315 InstructionSelector::ComplexRendererFns 5316 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const { 5317 Register Base, SOffset; 5318 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr)) 5319 return std::nullopt; 5320 5321 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 5322 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; 5323 } 5324 5325 InstructionSelector::ComplexRendererFns 5326 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const { 5327 Register Base, SOffset; 5328 int64_t Offset; 5329 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset)) 5330 return std::nullopt; 5331 5332 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); }, 5333 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, 5334 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}}; 5335 } 5336 5337 std::pair<Register, int> 5338 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, 5339 uint64_t FlatVariant) const { 5340 MachineInstr *MI = Root.getParent(); 5341 5342 auto Default = std::pair(Root.getReg(), 0); 5343 5344 if (!STI.hasFlatInstOffsets()) 5345 return Default; 5346 5347 Register PtrBase; 5348 int64_t ConstOffset; 5349 std::tie(PtrBase, ConstOffset) = 5350 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 5351 5352 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch && 5353 !isFlatScratchBaseLegal(Root.getReg()))) 5354 return Default; 5355 5356 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); 5357 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant)) 5358 return Default; 5359 5360 return std::pair(PtrBase, ConstOffset); 5361 } 5362 5363 InstructionSelector::ComplexRendererFns 5364 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const { 5365 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT); 5366 5367 return {{ 5368 
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 5369 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 5370 }}; 5371 } 5372 5373 InstructionSelector::ComplexRendererFns 5374 AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const { 5375 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal); 5376 5377 return {{ 5378 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 5379 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 5380 }}; 5381 } 5382 5383 InstructionSelector::ComplexRendererFns 5384 AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { 5385 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch); 5386 5387 return {{ 5388 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); }, 5389 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); }, 5390 }}; 5391 } 5392 5393 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) 5394 InstructionSelector::ComplexRendererFns 5395 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { 5396 Register Addr = Root.getReg(); 5397 Register PtrBase; 5398 int64_t ConstOffset; 5399 int64_t ImmOffset = 0; 5400 5401 // Match the immediate offset first, which canonically is moved as low as 5402 // possible. 5403 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 5404 5405 if (ConstOffset != 0) { 5406 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, 5407 SIInstrFlags::FlatGlobal)) { 5408 Addr = PtrBase; 5409 ImmOffset = ConstOffset; 5410 } else { 5411 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI); 5412 if (isSGPR(PtrBaseDef->Reg)) { 5413 if (ConstOffset > 0) { 5414 // Offset is too large. 5415 // 5416 // saddr + large_offset -> saddr + 5417 // (voffset = large_offset & ~MaxOffset) + 5418 // (large_offset & MaxOffset); 5419 int64_t SplitImmOffset, RemainderOffset; 5420 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset( 5421 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); 5422 5423 if (isUInt<32>(RemainderOffset)) { 5424 MachineInstr *MI = Root.getParent(); 5425 MachineBasicBlock *MBB = MI->getParent(); 5426 Register HighBits = 5427 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5428 5429 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 5430 HighBits) 5431 .addImm(RemainderOffset); 5432 5433 return {{ 5434 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr 5435 [=](MachineInstrBuilder &MIB) { 5436 MIB.addReg(HighBits); 5437 }, // voffset 5438 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, 5439 }}; 5440 } 5441 } 5442 5443 // We are adding a 64 bit SGPR and a constant. If constant bus limit 5444 // is 1 we would need to perform 1 or 2 extra moves for each half of 5445 // the constant and it is better to do a scalar add and then issue a 5446 // single VALU instruction to materialize zero. Otherwise it is less 5447 // instructions to perform VALU adds with immediates or inline literals. 5448 unsigned NumLiterals = 5449 !TII.isInlineConstant(APInt(32, Lo_32(ConstOffset))) + 5450 !TII.isInlineConstant(APInt(32, Hi_32(ConstOffset))); 5451 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) 5452 return std::nullopt; 5453 } 5454 } 5455 } 5456 5457 // Match the variable offset. 
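// The shape handled here is (G_PTR_ADD sgpr_base, zext(vgpr32)): the SGPR
// base becomes saddr and the 32-bit source of the zero-extend becomes
// voffset.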
5458 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 5459 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 5460 // Look through the SGPR->VGPR copy. 5461 Register SAddr = 5462 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI); 5463 5464 if (isSGPR(SAddr)) { 5465 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg(); 5466 5467 // It's possible voffset is an SGPR here, but the copy to VGPR will be 5468 // inserted later. 5469 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) { 5470 return {{[=](MachineInstrBuilder &MIB) { // saddr 5471 MIB.addReg(SAddr); 5472 }, 5473 [=](MachineInstrBuilder &MIB) { // voffset 5474 MIB.addReg(VOffset); 5475 }, 5476 [=](MachineInstrBuilder &MIB) { // offset 5477 MIB.addImm(ImmOffset); 5478 }}}; 5479 } 5480 } 5481 } 5482 5483 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and 5484 // drop this. 5485 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF || 5486 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg)) 5487 return std::nullopt; 5488 5489 // It's cheaper to materialize a single 32-bit zero for vaddr than the two 5490 // moves required to copy a 64-bit SGPR to VGPR. 5491 MachineInstr *MI = Root.getParent(); 5492 MachineBasicBlock *MBB = MI->getParent(); 5493 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5494 5495 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset) 5496 .addImm(0); 5497 5498 return {{ 5499 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr 5500 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset 5501 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5502 }}; 5503 } 5504 5505 InstructionSelector::ComplexRendererFns 5506 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { 5507 Register Addr = Root.getReg(); 5508 Register PtrBase; 5509 int64_t ConstOffset; 5510 int64_t ImmOffset = 0; 5511 5512 // Match the immediate offset first, which canonically is moved as low as 5513 // possible. 
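// A legal flat-scratch immediate is peeled off first; the remaining base is
// then matched as a frame index or an SGPR (the frame_index + SGPR case is
// folded through S_ADD_I32 below).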
5514 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 5515 5516 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) && 5517 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 5518 SIInstrFlags::FlatScratch)) { 5519 Addr = PtrBase; 5520 ImmOffset = ConstOffset; 5521 } 5522 5523 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 5524 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 5525 int FI = AddrDef->MI->getOperand(1).getIndex(); 5526 return {{ 5527 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 5528 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5529 }}; 5530 } 5531 5532 Register SAddr = AddrDef->Reg; 5533 5534 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) { 5535 Register LHS = AddrDef->MI->getOperand(1).getReg(); 5536 Register RHS = AddrDef->MI->getOperand(2).getReg(); 5537 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 5538 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI); 5539 5540 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX && 5541 isSGPR(RHSDef->Reg)) { 5542 int FI = LHSDef->MI->getOperand(1).getIndex(); 5543 MachineInstr &I = *Root.getParent(); 5544 MachineBasicBlock *BB = I.getParent(); 5545 const DebugLoc &DL = I.getDebugLoc(); 5546 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 5547 5548 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr) 5549 .addFrameIndex(FI) 5550 .addReg(RHSDef->Reg) 5551 .setOperandDead(3); // Dead scc 5552 } 5553 } 5554 5555 if (!isSGPR(SAddr)) 5556 return std::nullopt; 5557 5558 return {{ 5559 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr 5560 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5561 }}; 5562 } 5563 5564 // Check whether the flat scratch SVS swizzle bug affects this access. 5565 bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( 5566 Register VAddr, Register SAddr, uint64_t ImmOffset) const { 5567 if (!Subtarget->hasFlatScratchSVSSwizzleBug()) 5568 return false; 5569 5570 // The bug affects the swizzling of SVS accesses if there is any carry out 5571 // from the two low order bits (i.e. from bit 1 into bit 2) when adding 5572 // voffset to (soffset + inst_offset). 5573 auto VKnown = VT->getKnownBits(VAddr); 5574 auto SKnown = KnownBits::add(VT->getKnownBits(SAddr), 5575 KnownBits::makeConstant(APInt(32, ImmOffset))); 5576 uint64_t VMax = VKnown.getMaxValue().getZExtValue(); 5577 uint64_t SMax = SKnown.getMaxValue().getZExtValue(); 5578 return (VMax & 3) + (SMax & 3) >= 4; 5579 } 5580 5581 InstructionSelector::ComplexRendererFns 5582 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { 5583 Register Addr = Root.getReg(); 5584 Register PtrBase; 5585 int64_t ConstOffset; 5586 int64_t ImmOffset = 0; 5587 5588 // Match the immediate offset first, which canonically is moved as low as 5589 // possible. 
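// After peeling the immediate, the remaining address must be a G_PTR_ADD
// whose RHS is the VGPR vaddr and whose LHS (an SGPR or a frame index)
// becomes saddr.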
5590 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); 5591 5592 Register OrigAddr = Addr; 5593 if (ConstOffset != 0 && 5594 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, 5595 SIInstrFlags::FlatScratch)) { 5596 Addr = PtrBase; 5597 ImmOffset = ConstOffset; 5598 } 5599 5600 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI); 5601 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) 5602 return std::nullopt; 5603 5604 Register RHS = AddrDef->MI->getOperand(2).getReg(); 5605 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) 5606 return std::nullopt; 5607 5608 Register LHS = AddrDef->MI->getOperand(1).getReg(); 5609 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); 5610 5611 if (OrigAddr != Addr) { 5612 if (!isFlatScratchBaseLegalSVImm(OrigAddr)) 5613 return std::nullopt; 5614 } else { 5615 if (!isFlatScratchBaseLegalSV(OrigAddr)) 5616 return std::nullopt; 5617 } 5618 5619 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) 5620 return std::nullopt; 5621 5622 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { 5623 int FI = LHSDef->MI->getOperand(1).getIndex(); 5624 return {{ 5625 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 5626 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr 5627 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5628 }}; 5629 } 5630 5631 if (!isSGPR(LHS)) 5632 return std::nullopt; 5633 5634 return {{ 5635 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr 5636 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr 5637 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset 5638 }}; 5639 } 5640 5641 InstructionSelector::ComplexRendererFns 5642 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { 5643 MachineInstr *MI = Root.getParent(); 5644 MachineBasicBlock *MBB = MI->getParent(); 5645 MachineFunction *MF = MBB->getParent(); 5646 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5647 5648 int64_t Offset = 0; 5649 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) && 5650 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) { 5651 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 5652 5653 // TODO: Should this be inside the render function? The iterator seems to 5654 // move. 5655 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget); 5656 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), 5657 HighBits) 5658 .addImm(Offset & ~MaxOffset); 5659 5660 return {{[=](MachineInstrBuilder &MIB) { // rsrc 5661 MIB.addReg(Info->getScratchRSrcReg()); 5662 }, 5663 [=](MachineInstrBuilder &MIB) { // vaddr 5664 MIB.addReg(HighBits); 5665 }, 5666 [=](MachineInstrBuilder &MIB) { // soffset 5667 // Use constant zero for soffset and rely on eliminateFrameIndex 5668 // to choose the appropriate frame register if need be. 5669 MIB.addImm(0); 5670 }, 5671 [=](MachineInstrBuilder &MIB) { // offset 5672 MIB.addImm(Offset & MaxOffset); 5673 }}}; 5674 } 5675 5676 assert(Offset == 0 || Offset == -1); 5677 5678 // Try to fold a frame index directly into the MUBUF vaddr field, and any 5679 // offsets. 
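// Both a plain G_FRAME_INDEX and (G_PTR_ADD frame_index, const) are handled;
// in the latter case the constant is placed in the immediate offset field
// when it is a legal MUBUF offset.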
5680 std::optional<int> FI; 5681 Register VAddr = Root.getReg(); 5682 5683 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5684 Register PtrBase; 5685 int64_t ConstOffset; 5686 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI); 5687 if (ConstOffset != 0) { 5688 if (TII.isLegalMUBUFImmOffset(ConstOffset) && 5689 (!STI.privateMemoryResourceIsRangeChecked() || 5690 VT->signBitIsZero(PtrBase))) { 5691 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); 5692 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) 5693 FI = PtrBaseDef->getOperand(1).getIndex(); 5694 else 5695 VAddr = PtrBase; 5696 Offset = ConstOffset; 5697 } 5698 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) { 5699 FI = RootDef->getOperand(1).getIndex(); 5700 } 5701 5702 return {{[=](MachineInstrBuilder &MIB) { // rsrc 5703 MIB.addReg(Info->getScratchRSrcReg()); 5704 }, 5705 [=](MachineInstrBuilder &MIB) { // vaddr 5706 if (FI) 5707 MIB.addFrameIndex(*FI); 5708 else 5709 MIB.addReg(VAddr); 5710 }, 5711 [=](MachineInstrBuilder &MIB) { // soffset 5712 // Use constant zero for soffset and rely on eliminateFrameIndex 5713 // to choose the appropriate frame register if need be. 5714 MIB.addImm(0); 5715 }, 5716 [=](MachineInstrBuilder &MIB) { // offset 5717 MIB.addImm(Offset); 5718 }}}; 5719 } 5720 5721 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, 5722 int64_t Offset) const { 5723 if (!isUInt<16>(Offset)) 5724 return false; 5725 5726 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 5727 return true; 5728 5729 // On Southern Islands instruction with a negative base value and an offset 5730 // don't seem to work. 5731 return VT->signBitIsZero(Base); 5732 } 5733 5734 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, 5735 int64_t Offset1, 5736 unsigned Size) const { 5737 if (Offset0 % Size != 0 || Offset1 % Size != 0) 5738 return false; 5739 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size)) 5740 return false; 5741 5742 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled()) 5743 return true; 5744 5745 // On Southern Islands instruction with a negative base value and an offset 5746 // don't seem to work. 5747 return VT->signBitIsZero(Base); 5748 } 5749 5750 // Return whether the operation has NoUnsignedWrap property. 5751 static bool isNoUnsignedWrap(MachineInstr *Addr) { 5752 return Addr->getOpcode() == TargetOpcode::G_OR || 5753 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD && 5754 Addr->getFlag(MachineInstr::NoUWrap)); 5755 } 5756 5757 // Check that the base address of flat scratch load/store in the form of `base + 5758 // offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware 5759 // requirement). We always treat the first operand as the base address here. 5760 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const { 5761 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5762 5763 if (isNoUnsignedWrap(AddrMI)) 5764 return true; 5765 5766 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5767 // values. 
5768 if (STI.hasSignedScratchOffsets()) 5769 return true; 5770 5771 Register LHS = AddrMI->getOperand(1).getReg(); 5772 Register RHS = AddrMI->getOperand(2).getReg(); 5773 5774 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { 5775 std::optional<ValueAndVReg> RhsValReg = 5776 getIConstantVRegValWithLookThrough(RHS, *MRI); 5777 // If the immediate offset is negative and within certain range, the base 5778 // address cannot also be negative. If the base is also negative, the sum 5779 // would be either negative or much larger than the valid range of scratch 5780 // memory a thread can access. 5781 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 && 5782 RhsValReg->Value.getSExtValue() > -0x40000000) 5783 return true; 5784 } 5785 5786 return VT->signBitIsZero(LHS); 5787 } 5788 5789 // Check address value in SGPR/VGPR are legal for flat scratch in the form 5790 // of: SGPR + VGPR. 5791 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const { 5792 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5793 5794 if (isNoUnsignedWrap(AddrMI)) 5795 return true; 5796 5797 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5798 // values. 5799 if (STI.hasSignedScratchOffsets()) 5800 return true; 5801 5802 Register LHS = AddrMI->getOperand(1).getReg(); 5803 Register RHS = AddrMI->getOperand(2).getReg(); 5804 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS); 5805 } 5806 5807 // Check address value in SGPR/VGPR are legal for flat scratch in the form 5808 // of: SGPR + VGPR + Imm. 5809 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm( 5810 Register Addr) const { 5811 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative 5812 // values. 5813 if (STI.hasSignedScratchOffsets()) 5814 return true; 5815 5816 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI); 5817 Register Base = AddrMI->getOperand(1).getReg(); 5818 std::optional<DefinitionAndSourceRegister> BaseDef = 5819 getDefSrcRegIgnoringCopies(Base, *MRI); 5820 std::optional<ValueAndVReg> RHSOffset = 5821 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI); 5822 assert(RHSOffset); 5823 5824 // If the immediate offset is negative and within certain range, the base 5825 // address cannot also be negative. If the base is also negative, the sum 5826 // would be either negative or much larger than the valid range of scratch 5827 // memory a thread can access. 
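// The same -0x40000000 cutoff as in isFlatScratchBaseLegal is used below.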
5828 if (isNoUnsignedWrap(BaseDef->MI) && 5829 (isNoUnsignedWrap(AddrMI) || 5830 (RHSOffset->Value.getSExtValue() < 0 && 5831 RHSOffset->Value.getSExtValue() > -0x40000000))) 5832 return true; 5833 5834 Register LHS = BaseDef->MI->getOperand(1).getReg(); 5835 Register RHS = BaseDef->MI->getOperand(2).getReg(); 5836 return VT->signBitIsZero(RHS) && VT->signBitIsZero(LHS); 5837 } 5838 5839 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, 5840 unsigned ShAmtBits) const { 5841 assert(MI.getOpcode() == TargetOpcode::G_AND); 5842 5843 std::optional<APInt> RHS = 5844 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI); 5845 if (!RHS) 5846 return false; 5847 5848 if (RHS->countr_one() >= ShAmtBits) 5849 return true; 5850 5851 const APInt &LHSKnownZeros = VT->getKnownZeroes(MI.getOperand(1).getReg()); 5852 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; 5853 } 5854 5855 InstructionSelector::ComplexRendererFns 5856 AMDGPUInstructionSelector::selectMUBUFScratchOffset( 5857 MachineOperand &Root) const { 5858 Register Reg = Root.getReg(); 5859 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 5860 5861 std::optional<DefinitionAndSourceRegister> Def = 5862 getDefSrcRegIgnoringCopies(Reg, *MRI); 5863 assert(Def && "this shouldn't be an optional result"); 5864 Reg = Def->Reg; 5865 5866 if (Register WaveBase = getWaveAddress(Def->MI)) { 5867 return {{ 5868 [=](MachineInstrBuilder &MIB) { // rsrc 5869 MIB.addReg(Info->getScratchRSrcReg()); 5870 }, 5871 [=](MachineInstrBuilder &MIB) { // soffset 5872 MIB.addReg(WaveBase); 5873 }, 5874 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset 5875 }}; 5876 } 5877 5878 int64_t Offset = 0; 5879 5880 // FIXME: Copy check is a hack 5881 Register BasePtr; 5882 if (mi_match(Reg, *MRI, 5883 m_GPtrAdd(m_Reg(BasePtr), 5884 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) { 5885 if (!TII.isLegalMUBUFImmOffset(Offset)) 5886 return {}; 5887 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI); 5888 Register WaveBase = getWaveAddress(BasePtrDef); 5889 if (!WaveBase) 5890 return {}; 5891 5892 return {{ 5893 [=](MachineInstrBuilder &MIB) { // rsrc 5894 MIB.addReg(Info->getScratchRSrcReg()); 5895 }, 5896 [=](MachineInstrBuilder &MIB) { // soffset 5897 MIB.addReg(WaveBase); 5898 }, 5899 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 5900 }}; 5901 } 5902 5903 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) || 5904 !TII.isLegalMUBUFImmOffset(Offset)) 5905 return {}; 5906 5907 return {{ 5908 [=](MachineInstrBuilder &MIB) { // rsrc 5909 MIB.addReg(Info->getScratchRSrcReg()); 5910 }, 5911 [=](MachineInstrBuilder &MIB) { // soffset 5912 MIB.addImm(0); 5913 }, 5914 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset 5915 }}; 5916 } 5917 5918 std::pair<Register, unsigned> 5919 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { 5920 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5921 int64_t ConstAddr = 0; 5922 5923 Register PtrBase; 5924 int64_t Offset; 5925 std::tie(PtrBase, Offset) = 5926 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 5927 5928 if (Offset) { 5929 if (isDSOffsetLegal(PtrBase, Offset)) { 5930 // (add n0, c0) 5931 return std::pair(PtrBase, Offset); 5932 } 5933 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 5934 // TODO 5935 5936 5937 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 5938 // TODO 5939 5940 } 5941 5942 return std::pair(Root.getReg(), 0); 5943 } 5944 5945 
InstructionSelector::ComplexRendererFns 5946 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { 5947 Register Reg; 5948 unsigned Offset; 5949 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); 5950 return {{ 5951 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 5952 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } 5953 }}; 5954 } 5955 5956 InstructionSelector::ComplexRendererFns 5957 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { 5958 return selectDSReadWrite2(Root, 4); 5959 } 5960 5961 InstructionSelector::ComplexRendererFns 5962 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const { 5963 return selectDSReadWrite2(Root, 8); 5964 } 5965 5966 InstructionSelector::ComplexRendererFns 5967 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root, 5968 unsigned Size) const { 5969 Register Reg; 5970 unsigned Offset; 5971 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size); 5972 return {{ 5973 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, 5974 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, 5975 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); } 5976 }}; 5977 } 5978 5979 std::pair<Register, unsigned> 5980 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root, 5981 unsigned Size) const { 5982 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); 5983 int64_t ConstAddr = 0; 5984 5985 Register PtrBase; 5986 int64_t Offset; 5987 std::tie(PtrBase, Offset) = 5988 getPtrBaseWithConstantOffset(Root.getReg(), *MRI); 5989 5990 if (Offset) { 5991 int64_t OffsetValue0 = Offset; 5992 int64_t OffsetValue1 = Offset + Size; 5993 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) { 5994 // (add n0, c0) 5995 return std::pair(PtrBase, OffsetValue0 / Size); 5996 } 5997 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { 5998 // TODO 5999 6000 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { 6001 // TODO 6002 6003 } 6004 6005 return std::pair(Root.getReg(), 0); 6006 } 6007 6008 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return 6009 /// the base value with the constant offset. There may be intervening copies 6010 /// between \p Root and the identified constant. Returns \p Root, 0 if this does 6011 /// not match the pattern. 6012 std::pair<Register, int64_t> 6013 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset( 6014 Register Root, const MachineRegisterInfo &MRI) const { 6015 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI); 6016 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) 6017 return {Root, 0}; 6018 6019 MachineOperand &RHS = RootI->getOperand(2); 6020 std::optional<ValueAndVReg> MaybeOffset = 6021 getIConstantVRegValWithLookThrough(RHS.getReg(), MRI); 6022 if (!MaybeOffset) 6023 return {Root, 0}; 6024 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()}; 6025 } 6026 6027 static void addZeroImm(MachineInstrBuilder &MIB) { 6028 MIB.addImm(0); 6029 } 6030 6031 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p 6032 /// BasePtr is not valid, a null base pointer will be used. 
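/// The descriptor is assembled as a 128-bit REG_SEQUENCE: sub0_sub1 holds the
/// base pointer (or an S_MOV_B64 of 0 when \p BasePtr is invalid) and
/// sub2_sub3 holds the two format words \p FormatLo and \p FormatHi.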
6033 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI, 6034 uint32_t FormatLo, uint32_t FormatHi, 6035 Register BasePtr) { 6036 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6037 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6038 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 6039 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 6040 6041 B.buildInstr(AMDGPU::S_MOV_B32) 6042 .addDef(RSrc2) 6043 .addImm(FormatLo); 6044 B.buildInstr(AMDGPU::S_MOV_B32) 6045 .addDef(RSrc3) 6046 .addImm(FormatHi); 6047 6048 // Build the half of the subregister with the constants before building the 6049 // full 128-bit register. If we are building multiple resource descriptors, 6050 // this will allow CSEing of the 2-component register. 6051 B.buildInstr(AMDGPU::REG_SEQUENCE) 6052 .addDef(RSrcHi) 6053 .addReg(RSrc2) 6054 .addImm(AMDGPU::sub0) 6055 .addReg(RSrc3) 6056 .addImm(AMDGPU::sub1); 6057 6058 Register RSrcLo = BasePtr; 6059 if (!BasePtr) { 6060 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 6061 B.buildInstr(AMDGPU::S_MOV_B64) 6062 .addDef(RSrcLo) 6063 .addImm(0); 6064 } 6065 6066 B.buildInstr(AMDGPU::REG_SEQUENCE) 6067 .addDef(RSrc) 6068 .addReg(RSrcLo) 6069 .addImm(AMDGPU::sub0_sub1) 6070 .addReg(RSrcHi) 6071 .addImm(AMDGPU::sub2_sub3); 6072 6073 return RSrc; 6074 } 6075 6076 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 6077 const SIInstrInfo &TII, Register BasePtr) { 6078 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 6079 6080 // FIXME: Why are half the "default" bits ignored based on the addressing 6081 // mode? 6082 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr); 6083 } 6084 6085 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI, 6086 const SIInstrInfo &TII, Register BasePtr) { 6087 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat(); 6088 6089 // FIXME: Why are half the "default" bits ignored based on the addressing 6090 // mode? 6091 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr); 6092 } 6093 6094 AMDGPUInstructionSelector::MUBUFAddressData 6095 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const { 6096 MUBUFAddressData Data; 6097 Data.N0 = Src; 6098 6099 Register PtrBase; 6100 int64_t Offset; 6101 6102 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI); 6103 if (isUInt<32>(Offset)) { 6104 Data.N0 = PtrBase; 6105 Data.Offset = Offset; 6106 } 6107 6108 if (MachineInstr *InputAdd 6109 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) { 6110 Data.N2 = InputAdd->getOperand(1).getReg(); 6111 Data.N3 = InputAdd->getOperand(2).getReg(); 6112 6113 // FIXME: Need to fix extra SGPR->VGPRcopies inserted 6114 // FIXME: Don't know this was defined by operand 0 6115 // 6116 // TODO: Remove this when we have copy folding optimizations after 6117 // RegBankSelect. 6118 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg(); 6119 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg(); 6120 } 6121 6122 return Data; 6123 } 6124 6125 /// Return if the addr64 mubuf mode should be used for the given address. 
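/// Addr64 is needed whenever a VGPR component participates in the address:
/// either an inner ptr_add was matched (N2 is set) or the pointer itself
/// lives in the VGPR bank.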
6126 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const { 6127 // (ptr_add N2, N3) -> addr64, or 6128 // (ptr_add (ptr_add N2, N3), C1) -> addr64 6129 if (Addr.N2) 6130 return true; 6131 6132 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI); 6133 return N0Bank->getID() == AMDGPU::VGPRRegBankID; 6134 } 6135 6136 /// Split an immediate offset \p ImmOffset depending on whether it fits in the 6137 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable 6138 /// component. 6139 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset( 6140 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const { 6141 if (TII.isLegalMUBUFImmOffset(ImmOffset)) 6142 return; 6143 6144 // Illegal offset, store it in soffset. 6145 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 6146 B.buildInstr(AMDGPU::S_MOV_B32) 6147 .addDef(SOffset) 6148 .addImm(ImmOffset); 6149 ImmOffset = 0; 6150 } 6151 6152 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl( 6153 MachineOperand &Root, Register &VAddr, Register &RSrcReg, 6154 Register &SOffset, int64_t &Offset) const { 6155 // FIXME: Predicates should stop this from reaching here. 6156 // addr64 bit was removed for volcanic islands. 6157 if (!STI.hasAddr64() || STI.useFlatForGlobal()) 6158 return false; 6159 6160 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 6161 if (!shouldUseAddr64(AddrData)) 6162 return false; 6163 6164 Register N0 = AddrData.N0; 6165 Register N2 = AddrData.N2; 6166 Register N3 = AddrData.N3; 6167 Offset = AddrData.Offset; 6168 6169 // Base pointer for the SRD. 6170 Register SRDPtr; 6171 6172 if (N2) { 6173 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 6174 assert(N3); 6175 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 6176 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the 6177 // addr64, and construct the default resource from a 0 address. 6178 VAddr = N0; 6179 } else { 6180 SRDPtr = N3; 6181 VAddr = N2; 6182 } 6183 } else { 6184 // N2 is not divergent. 6185 SRDPtr = N2; 6186 VAddr = N3; 6187 } 6188 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) { 6189 // Use the default null pointer in the resource 6190 VAddr = N0; 6191 } else { 6192 // N0 -> offset, or 6193 // (N0 + C1) -> offset 6194 SRDPtr = N0; 6195 } 6196 6197 MachineIRBuilder B(*Root.getParent()); 6198 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr); 6199 splitIllegalMUBUFOffset(B, SOffset, Offset); 6200 return true; 6201 } 6202 6203 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl( 6204 MachineOperand &Root, Register &RSrcReg, Register &SOffset, 6205 int64_t &Offset) const { 6206 6207 // FIXME: Pattern should not reach here. 6208 if (STI.useFlatForGlobal()) 6209 return false; 6210 6211 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); 6212 if (shouldUseAddr64(AddrData)) 6213 return false; 6214 6215 // N0 -> offset, or 6216 // (N0 + C1) -> offset 6217 Register SRDPtr = AddrData.N0; 6218 Offset = AddrData.Offset; 6219 6220 // TODO: Look through extensions for 32-bit soffset. 
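// Build the resource from the scalar base and move any non-encodable part of
// the immediate offset into soffset.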
6221 MachineIRBuilder B(*Root.getParent()); 6222 6223 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); 6224 splitIllegalMUBUFOffset(B, SOffset, Offset); 6225 return true; 6226 } 6227 6228 InstructionSelector::ComplexRendererFns 6229 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { 6230 Register VAddr; 6231 Register RSrcReg; 6232 Register SOffset; 6233 int64_t Offset = 0; 6234 6235 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) 6236 return {}; 6237 6238 // FIXME: Use defaulted operands for trailing 0s and remove from the complex 6239 // pattern. 6240 return {{ 6241 [=](MachineInstrBuilder &MIB) { // rsrc 6242 MIB.addReg(RSrcReg); 6243 }, 6244 [=](MachineInstrBuilder &MIB) { // vaddr 6245 MIB.addReg(VAddr); 6246 }, 6247 [=](MachineInstrBuilder &MIB) { // soffset 6248 if (SOffset) 6249 MIB.addReg(SOffset); 6250 else if (STI.hasRestrictedSOffset()) 6251 MIB.addReg(AMDGPU::SGPR_NULL); 6252 else 6253 MIB.addImm(0); 6254 }, 6255 [=](MachineInstrBuilder &MIB) { // offset 6256 MIB.addImm(Offset); 6257 }, 6258 addZeroImm, // cpol 6259 addZeroImm, // tfe 6260 addZeroImm // swz 6261 }}; 6262 } 6263 6264 InstructionSelector::ComplexRendererFns 6265 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { 6266 Register RSrcReg; 6267 Register SOffset; 6268 int64_t Offset = 0; 6269 6270 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) 6271 return {}; 6272 6273 return {{ 6274 [=](MachineInstrBuilder &MIB) { // rsrc 6275 MIB.addReg(RSrcReg); 6276 }, 6277 [=](MachineInstrBuilder &MIB) { // soffset 6278 if (SOffset) 6279 MIB.addReg(SOffset); 6280 else if (STI.hasRestrictedSOffset()) 6281 MIB.addReg(AMDGPU::SGPR_NULL); 6282 else 6283 MIB.addImm(0); 6284 }, 6285 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset 6286 addZeroImm, // cpol 6287 addZeroImm, // tfe 6288 addZeroImm, // swz 6289 }}; 6290 } 6291 6292 InstructionSelector::ComplexRendererFns 6293 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const { 6294 6295 Register SOffset = Root.getReg(); 6296 6297 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt())) 6298 SOffset = AMDGPU::SGPR_NULL; 6299 6300 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}}; 6301 } 6302 6303 /// Get an immediate that must be 32-bits, and treated as zero extended. 6304 static std::optional<uint64_t> 6305 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) { 6306 // getIConstantVRegVal sexts any values, so see if that matters. 6307 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI); 6308 if (!OffsetVal || !isInt<32>(*OffsetVal)) 6309 return std::nullopt; 6310 return Lo_32(*OffsetVal); 6311 } 6312 6313 InstructionSelector::ComplexRendererFns 6314 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const { 6315 std::optional<uint64_t> OffsetVal = 6316 Root.isImm() ? 
Root.getImm() : getConstantZext32Val(Root.getReg(), *MRI); 6317 if (!OffsetVal) 6318 return {}; 6319 6320 std::optional<int64_t> EncodedImm = 6321 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true); 6322 if (!EncodedImm) 6323 return {}; 6324 6325 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 6326 } 6327 6328 InstructionSelector::ComplexRendererFns 6329 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const { 6330 assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS); 6331 6332 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI); 6333 if (!OffsetVal) 6334 return {}; 6335 6336 std::optional<int64_t> EncodedImm = 6337 AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal); 6338 if (!EncodedImm) 6339 return {}; 6340 6341 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }}; 6342 } 6343 6344 InstructionSelector::ComplexRendererFns 6345 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { 6346 // Match the (soffset + offset) pair as a 32-bit register base and 6347 // an immediate offset. 6348 Register SOffset; 6349 unsigned Offset; 6350 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset( 6351 *MRI, Root.getReg(), VT, /*CheckNUW*/ true); 6352 if (!SOffset) 6353 return std::nullopt; 6354 6355 std::optional<int64_t> EncodedOffset = 6356 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true); 6357 if (!EncodedOffset) 6358 return std::nullopt; 6359 6360 assert(MRI->getType(SOffset) == LLT::scalar(32)); 6361 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }, 6362 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}}; 6363 } 6364 6365 std::pair<Register, unsigned> 6366 AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, 6367 bool &Matched) const { 6368 Matched = false; 6369 6370 Register Src; 6371 unsigned Mods; 6372 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); 6373 6374 if (mi_match(Src, *MRI, m_GFPExt(m_Reg(Src)))) { 6375 assert(MRI->getType(Src) == LLT::scalar(16)); 6376 6377 // Only change Src if src modifier could be gained. In such cases new Src 6378 // could be sgpr but this does not violate constant bus restriction for 6379 // instruction that is being selected. 6380 Src = stripBitCast(Src, *MRI); 6381 6382 const auto CheckAbsNeg = [&]() { 6383 // Be careful about folding modifiers if we already have an abs. fneg is 6384 // applied last, so we don't want to apply an earlier fneg. 6385 if ((Mods & SISrcMods::ABS) == 0) { 6386 unsigned ModsTmp; 6387 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src); 6388 6389 if ((ModsTmp & SISrcMods::NEG) != 0) 6390 Mods ^= SISrcMods::NEG; 6391 6392 if ((ModsTmp & SISrcMods::ABS) != 0) 6393 Mods |= SISrcMods::ABS; 6394 } 6395 }; 6396 6397 CheckAbsNeg(); 6398 6399 // op_sel/op_sel_hi decide the source type and source. 6400 // If the source's op_sel_hi is set, it indicates to do a conversion from 6401 // fp16. If the sources's op_sel is set, it picks the high half of the 6402 // source register. 
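// Hence OP_SEL_1 is always set for the matched fpext-from-f16 source, and
// OP_SEL_0 is added below when the source is the high half of a register.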
6403 6404 Mods |= SISrcMods::OP_SEL_1; 6405 6406 if (isExtractHiElt(*MRI, Src, Src)) { 6407 Mods |= SISrcMods::OP_SEL_0; 6408 CheckAbsNeg(); 6409 } 6410 6411 Matched = true; 6412 } 6413 6414 return {Src, Mods}; 6415 } 6416 6417 InstructionSelector::ComplexRendererFns 6418 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt( 6419 MachineOperand &Root) const { 6420 Register Src; 6421 unsigned Mods; 6422 bool Matched; 6423 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); 6424 if (!Matched) 6425 return {}; 6426 6427 return {{ 6428 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 6429 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 6430 }}; 6431 } 6432 6433 InstructionSelector::ComplexRendererFns 6434 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { 6435 Register Src; 6436 unsigned Mods; 6437 bool Matched; 6438 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); 6439 6440 return {{ 6441 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, 6442 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods 6443 }}; 6444 } 6445 6446 bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst( 6447 MachineInstr &I, Intrinsic::ID IntrID) const { 6448 MachineBasicBlock *MBB = I.getParent(); 6449 const DebugLoc &DL = I.getDebugLoc(); 6450 Register CCReg = I.getOperand(0).getReg(); 6451 6452 // Set SCC to true, in case the barrier instruction gets converted to a NOP. 6453 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0); 6454 6455 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM)) 6456 .addImm(I.getOperand(2).getImm()); 6457 6458 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC); 6459 6460 I.eraseFromParent(); 6461 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass, 6462 *MRI); 6463 } 6464 6465 bool AMDGPUInstructionSelector::selectSGetBarrierState( 6466 MachineInstr &I, Intrinsic::ID IntrID) const { 6467 MachineBasicBlock *MBB = I.getParent(); 6468 const DebugLoc &DL = I.getDebugLoc(); 6469 MachineOperand BarOp = I.getOperand(2); 6470 std::optional<int64_t> BarValImm = 6471 getIConstantVRegSExtVal(BarOp.getReg(), *MRI); 6472 6473 if (!BarValImm) { 6474 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) 6475 .addReg(BarOp.getReg()); 6476 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI); 6477 } 6478 MachineInstrBuilder MIB; 6479 unsigned Opc = BarValImm ? 
                           : AMDGPU::S_GET_BARRIER_STATE_M0;
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  auto DstReg = I.getOperand(0).getReg();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;
  MIB.addDef(DstReg);
  if (BarValImm) {
    MIB.addImm(*BarValImm);
  }
  I.eraseFromParent();
  return true;
}

unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
  if (HasInlineConst) {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_IMM;
    }
  } else {
    switch (IntrID) {
    default:
      llvm_unreachable("not a named barrier op");
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_M0;
    }
  }
}

bool AMDGPUInstructionSelector::selectNamedBarrierInit(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = I.getOperand(1);
  MachineOperand CntOp = I.getOperand(2);

  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
      .add(BarOp)
      .addImm(4u)
      .setOperandDead(3); // Dead scc

  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
      .addReg(TmpReg0)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  // MO = ((CntOp & 0x3F) << ShAmt) | BarID
  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
      .add(CntOp)
      .addImm(0x3F)
      .setOperandDead(3); // Dead scc

  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  constexpr unsigned ShAmt = 16;
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
      .addReg(TmpReg2)
      .addImm(ShAmt)
      .setOperandDead(3); // Dead scc

  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
      .addReg(TmpReg1)
      .addReg(TmpReg3)
      .setOperandDead(3); // Dead scc

  auto CopyMIB =
      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
  constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);

  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectNamedBarrierInst(
    MachineInstr &I, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
                             ? I.getOperand(2)
                             : I.getOperand(1);
  std::optional<int64_t> BarValImm =
      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);

  if (!BarValImm) {
    // BarID = (BarOp >> 4) & 0x3F
    Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
        .addReg(BarOp.getReg())
        .addImm(4u)
        .setOperandDead(3); // Dead scc

    Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
        .addReg(TmpReg0)
        .addImm(0x3F)
        .setOperandDead(3); // Dead scc

    auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
                       .addReg(TmpReg1);
    constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
  }

  MachineInstrBuilder MIB;
  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));

  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    auto DstReg = I.getOperand(0).getReg();
    const TargetRegisterClass *DstRC =
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
    if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
      return false;
    MIB.addDef(DstReg);
  }

  if (BarValImm) {
    auto BarId = ((*BarValImm) >> 4) & 0x3F;
    MIB.addImm(BarId);
  }

  I.eraseFromParent();
  return true;
}

void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
}

void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(1);
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
}

void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
}

/// This only really exists to satisfy DAG type checking machinery, so is a
/// no-op here.
void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  const MachineOperand &Op = MI.getOperand(OpIdx);
  int64_t Imm;
  if (Op.isReg() && mi_match(Op.getReg(), *MRI, m_ICst(Imm)))
    MIB.addImm(Imm);
  else
    MIB.addImm(Op.getImm());
}

void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
                                                   const MachineInstr &MI,
                                                   int OpIdx) const {
  MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
}

void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
                 ? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
                 : (int64_t)SISrcMods::DST_OP_SEL);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
                 ? (int64_t)(SISrcMods::OP_SEL_0)
                 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
                                           : 0);
}

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}

void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(
      (MI.getOperand(OpIdx).getImm() & 0x2)
          ? (int64_t)SISrcMods::DST_OP_SEL
          : 0);
}

void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
                                                  const MachineInstr &MI,
                                                  int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &
             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                       : AMDGPU::CPol::ALL_pregfx12));
}

void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
                                                 : AMDGPU::CPol::SWZ_pregfx12);
  MIB.addImm(Swizzle);
}

void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
                                                  : AMDGPU::CPol::ALL_pregfx12);
  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}

void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
                                                 const MachineInstr &MI,
                                                 int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
}

void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
                                                       const MachineInstr &MI,
                                                       int OpIdx) const {
  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  int ExpVal = APF.getExactLog2Abs();
  assert(ExpVal != INT_MIN);
  MIB.addImm(ExpVal);
}

void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI,
                                                int OpIdx) const {
  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
  MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
}

/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
    MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
  unsigned Val = MI.getOperand(OpIdx).getImm();
  unsigned New = 0;
  if (Val & 0x1)
    New |= SISrcMods::OP_SEL_0;
  if (Val & 0x2)
    New |= SISrcMods::OP_SEL_1;
  MIB.addImm(New);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
  return TII.isInlineConstant(Imm);
}

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
  return TII.isInlineConstant(Imm);
}