1 //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// SI Implementation of TargetInstrInfo. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "SIInstrInfo.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUInstrInfo.h" 17 #include "GCNHazardRecognizer.h" 18 #include "GCNSubtarget.h" 19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 20 #include "SIMachineFunctionInfo.h" 21 #include "llvm/Analysis/ValueTracking.h" 22 #include "llvm/CodeGen/LiveVariables.h" 23 #include "llvm/CodeGen/MachineDominators.h" 24 #include "llvm/CodeGen/RegisterScavenging.h" 25 #include "llvm/CodeGen/ScheduleDAG.h" 26 #include "llvm/IR/DiagnosticInfo.h" 27 #include "llvm/IR/IntrinsicsAMDGPU.h" 28 #include "llvm/MC/MCContext.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Target/TargetMachine.h" 31 32 using namespace llvm; 33 34 #define DEBUG_TYPE "si-instr-info" 35 36 #define GET_INSTRINFO_CTOR_DTOR 37 #include "AMDGPUGenInstrInfo.inc" 38 39 namespace llvm { 40 41 class AAResults; 42 43 namespace AMDGPU { 44 #define GET_D16ImageDimIntrinsics_IMPL 45 #define GET_ImageDimIntrinsicTable_IMPL 46 #define GET_RsrcIntrinsics_IMPL 47 #include "AMDGPUGenSearchableTables.inc" 48 } 49 } 50 51 52 // Must be at least 4 to be able to branch over minimum unconditional branch 53 // code. This is only for making it possible to write reasonably small tests for 54 // long branches. 55 static cl::opt<unsigned> 56 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), 57 cl::desc("Restrict range of branch instructions (DEBUG)")); 58 59 static cl::opt<bool> Fix16BitCopies( 60 "amdgpu-fix-16-bit-physreg-copies", 61 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), 62 cl::init(true), 63 cl::ReallyHidden); 64 65 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) 66 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), 67 RI(ST), ST(ST) { 68 SchedModel.init(&ST); 69 } 70 71 //===----------------------------------------------------------------------===// 72 // TargetInstrInfo callbacks 73 //===----------------------------------------------------------------------===// 74 75 static unsigned getNumOperandsNoGlue(SDNode *Node) { 76 unsigned N = Node->getNumOperands(); 77 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) 78 --N; 79 return N; 80 } 81 82 /// Returns true if both nodes have the same value for the given 83 /// operand \p Op, or if both nodes do not have this operand. 84 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { 85 unsigned Opc0 = N0->getMachineOpcode(); 86 unsigned Opc1 = N1->getMachineOpcode(); 87 88 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); 89 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); 90 91 if (Op0Idx == -1 && Op1Idx == -1) 92 return true; 93 94 95 if ((Op0Idx == -1 && Op1Idx != -1) || 96 (Op1Idx == -1 && Op0Idx != -1)) 97 return false; 98 99 // getNamedOperandIdx returns the index for the MachineInstr's operands, 100 // which includes the result as the first operand. 
  // We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AAResults *AA) const {
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI)) {
    // A VALU's implicit read of exec would normally block rematerialization,
    // but that is OK here since every VALU has an implicit exec read.
    // We really want all of the generic logic for this except for the exec
    // restriction.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere
    // in the function; otherwise it is safe since mode is not changed.
    return !MI.hasImplicitDef() &&
           MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
           !MI.mayRaiseFPException();
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent());
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads.
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
184 if (Load0->getOperand(0) != Load1->getOperand(0)) 185 return false; 186 187 const ConstantSDNode *Load0Offset = 188 dyn_cast<ConstantSDNode>(Load0->getOperand(1)); 189 const ConstantSDNode *Load1Offset = 190 dyn_cast<ConstantSDNode>(Load1->getOperand(1)); 191 192 if (!Load0Offset || !Load1Offset) 193 return false; 194 195 Offset0 = Load0Offset->getZExtValue(); 196 Offset1 = Load1Offset->getZExtValue(); 197 return true; 198 } 199 200 // MUBUF and MTBUF can access the same addresses. 201 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 202 203 // MUBUF and MTBUF have vaddr at different indices. 204 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 205 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 206 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 207 return false; 208 209 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 210 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 211 212 if (OffIdx0 == -1 || OffIdx1 == -1) 213 return false; 214 215 // getNamedOperandIdx returns the index for MachineInstrs. Since they 216 // include the output in the operand list, but SDNodes don't, we need to 217 // subtract the index by one. 218 OffIdx0 -= get(Opc0).NumDefs; 219 OffIdx1 -= get(Opc1).NumDefs; 220 221 SDValue Off0 = Load0->getOperand(OffIdx0); 222 SDValue Off1 = Load1->getOperand(OffIdx1); 223 224 // The offset might be a FrameIndexSDNode. 225 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 226 return false; 227 228 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 229 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 230 return true; 231 } 232 233 return false; 234 } 235 236 static bool isStride64(unsigned Opc) { 237 switch (Opc) { 238 case AMDGPU::DS_READ2ST64_B32: 239 case AMDGPU::DS_READ2ST64_B64: 240 case AMDGPU::DS_WRITE2ST64_B32: 241 case AMDGPU::DS_WRITE2ST64_B64: 242 return true; 243 default: 244 return false; 245 } 246 } 247 248 bool SIInstrInfo::getMemOperandsWithOffsetWidth( 249 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 250 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 251 const TargetRegisterInfo *TRI) const { 252 if (!LdSt.mayLoadOrStore()) 253 return false; 254 255 unsigned Opc = LdSt.getOpcode(); 256 OffsetIsScalable = false; 257 const MachineOperand *BaseOp, *OffsetOp; 258 int DataOpIdx; 259 260 if (isDS(LdSt)) { 261 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); 262 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); 263 if (OffsetOp) { 264 // Normal, single offset LDS instruction. 265 if (!BaseOp) { 266 // DS_CONSUME/DS_APPEND use M0 for the base address. 267 // TODO: find the implicit use operand for M0 and use that as BaseOp? 268 return false; 269 } 270 BaseOps.push_back(BaseOp); 271 Offset = OffsetOp->getImm(); 272 // Get appropriate operand, and compute width accordingly. 273 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 274 if (DataOpIdx == -1) 275 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 276 Width = getOpSize(LdSt, DataOpIdx); 277 } else { 278 // The 2 offset instructions use offset0 and offset1 instead. We can treat 279 // these as a load with a single offset if the 2 offsets are consecutive. 280 // We will use this for some partially aligned loads. 
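      // For example, a ds_read2_b32 with offset0=4 and offset1=5 reads two
      // consecutive dwords; it is reported here as a single 8-byte access at
      // Offset = EltSize * Offset0 (16 bytes from the base).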
281 const MachineOperand *Offset0Op = 282 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 283 const MachineOperand *Offset1Op = 284 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 285 286 unsigned Offset0 = Offset0Op->getImm(); 287 unsigned Offset1 = Offset1Op->getImm(); 288 if (Offset0 + 1 != Offset1) 289 return false; 290 291 // Each of these offsets is in element sized units, so we need to convert 292 // to bytes of the individual reads. 293 294 unsigned EltSize; 295 if (LdSt.mayLoad()) 296 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 297 else { 298 assert(LdSt.mayStore()); 299 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 300 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 301 } 302 303 if (isStride64(Opc)) 304 EltSize *= 64; 305 306 BaseOps.push_back(BaseOp); 307 Offset = EltSize * Offset0; 308 // Get appropriate operand(s), and compute width accordingly. 309 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 310 if (DataOpIdx == -1) { 311 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 312 Width = getOpSize(LdSt, DataOpIdx); 313 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); 314 Width += getOpSize(LdSt, DataOpIdx); 315 } else { 316 Width = getOpSize(LdSt, DataOpIdx); 317 } 318 } 319 return true; 320 } 321 322 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 323 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); 324 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL 325 return false; 326 BaseOps.push_back(RSrc); 327 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 328 if (BaseOp && !BaseOp->isFI()) 329 BaseOps.push_back(BaseOp); 330 const MachineOperand *OffsetImm = 331 getNamedOperand(LdSt, AMDGPU::OpName::offset); 332 Offset = OffsetImm->getImm(); 333 const MachineOperand *SOffset = 334 getNamedOperand(LdSt, AMDGPU::OpName::soffset); 335 if (SOffset) { 336 if (SOffset->isReg()) 337 BaseOps.push_back(SOffset); 338 else 339 Offset += SOffset->getImm(); 340 } 341 // Get appropriate operand, and compute width accordingly. 342 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 343 if (DataOpIdx == -1) 344 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); 345 Width = getOpSize(LdSt, DataOpIdx); 346 return true; 347 } 348 349 if (isMIMG(LdSt)) { 350 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 351 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); 352 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 353 if (VAddr0Idx >= 0) { 354 // GFX10 possible NSA encoding. 355 for (int I = VAddr0Idx; I < SRsrcIdx; ++I) 356 BaseOps.push_back(&LdSt.getOperand(I)); 357 } else { 358 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); 359 } 360 Offset = 0; 361 // Get appropriate operand, and compute width accordingly. 362 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); 363 Width = getOpSize(LdSt, DataOpIdx); 364 return true; 365 } 366 367 if (isSMRD(LdSt)) { 368 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); 369 if (!BaseOp) // e.g. S_MEMTIME 370 return false; 371 BaseOps.push_back(BaseOp); 372 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); 373 Offset = OffsetOp ? OffsetOp->getImm() : 0; 374 // Get appropriate operand, and compute width accordingly. 
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}

static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
                                  ArrayRef<const MachineOperand *> BaseOps1,
                                  const MachineInstr &MI2,
                                  ArrayRef<const MachineOperand *> BaseOps2) {
  // Only examine the first "base" operand of each instruction, on the
  // assumption that it represents the real base address of the memory access.
  // Other operands are typically offsets or indices from this base address.
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // In order to avoid register pressure, on average, the number of DWORDs
  // loaded together by all clustered mem ops should not exceed 8. This is an
  // empirical value based on certain observations and performance related
  // experiments.
  // The good thing about this heuristic is that it avoids clustering too many
  // sub-word loads and also avoids clustering wide loads. Below is a brief
  // summary of how the heuristic behaves for various `LoadSize` values:
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / NumLoads;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
  return NumDWORDs <= 8;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.

  // A cacheline is 64 bytes (for global memory).
  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}

static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}

/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
/// to directly copy, so an intermediate VGPR needs to be used.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();

  assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
         AMDGPU::AGPR_32RegClass.contains(SrcReg));

  // First try to find a defining accvgpr_write to avoid temporary registers.
  for (auto Def = MI, E = MBB.begin(); Def != E; ) {
    --Def;
    if (!Def->definesRegister(SrcReg, &RI))
      continue;
    if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      break;

    MachineOperand &DefOp = Def->getOperand(1);
    assert(DefOp.isReg() || DefOp.isImm());

    if (DefOp.isReg()) {
      // Check that the register source operand is not clobbered before MI.
      // Immediate operands are always safe to propagate.
529 bool SafeToPropagate = true; 530 for (auto I = Def; I != MI && SafeToPropagate; ++I) 531 if (I->modifiesRegister(DefOp.getReg(), &RI)) 532 SafeToPropagate = false; 533 534 if (!SafeToPropagate) 535 break; 536 537 DefOp.setIsKill(false); 538 } 539 540 MachineInstrBuilder Builder = 541 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 542 .add(DefOp); 543 if (ImpDefSuperReg) 544 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); 545 546 if (ImpUseSuperReg) { 547 Builder.addReg(ImpUseSuperReg, 548 getKillRegState(KillSrc) | RegState::Implicit); 549 } 550 551 return; 552 } 553 554 RS.enterBasicBlock(MBB); 555 RS.forward(MI); 556 557 // Ideally we want to have three registers for a long reg_sequence copy 558 // to hide 2 waitstates between v_mov_b32 and accvgpr_write. 559 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 560 *MBB.getParent()); 561 562 // Registers in the sequence are allocated contiguously so we can just 563 // use register number to pick one of three round-robin temps. 564 unsigned RegNo = DestReg % 3; 565 Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); 566 if (!Tmp) 567 report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); 568 RS.setRegUsed(Tmp); 569 570 if (!TII.getSubtarget().hasGFX90AInsts()) { 571 // Only loop through if there are any free registers left, otherwise 572 // scavenger may report a fatal error without emergency spill slot 573 // or spill with the slot. 574 while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { 575 Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); 576 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) 577 break; 578 Tmp = Tmp2; 579 RS.setRegUsed(Tmp); 580 } 581 } 582 583 // Insert copy to temporary VGPR. 584 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; 585 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { 586 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; 587 } else { 588 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 589 } 590 591 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) 592 .addReg(SrcReg, getKillRegState(KillSrc)); 593 if (ImpUseSuperReg) { 594 UseBuilder.addReg(ImpUseSuperReg, 595 getKillRegState(KillSrc) | RegState::Implicit); 596 } 597 598 MachineInstrBuilder DefBuilder 599 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 600 .addReg(Tmp, RegState::Kill); 601 602 if (ImpDefSuperReg) 603 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); 604 } 605 606 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, 607 MachineBasicBlock::iterator MI, const DebugLoc &DL, 608 MCRegister DestReg, MCRegister SrcReg, bool KillSrc, 609 const TargetRegisterClass *RC, bool Forward) { 610 const SIRegisterInfo &RI = TII.getRegisterInfo(); 611 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); 612 MachineBasicBlock::iterator I = MI; 613 MachineInstr *FirstMI = nullptr, *LastMI = nullptr; 614 615 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { 616 int16_t SubIdx = BaseIndices[Idx]; 617 Register Reg = RI.getSubReg(DestReg, SubIdx); 618 unsigned Opcode = AMDGPU::S_MOV_B32; 619 620 // Is SGPR aligned? If so try to combine with next. 
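    // A 64-bit SGPR move requires an even-aligned register pair, so only
    // widen to S_MOV_B64 when both the source and destination subregisters
    // start on an even SGPR.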
621 Register Src = RI.getSubReg(SrcReg, SubIdx); 622 bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; 623 bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; 624 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { 625 // Can use SGPR64 copy 626 unsigned Channel = RI.getChannelFromSubReg(SubIdx); 627 SubIdx = RI.getSubRegFromChannel(Channel, 2); 628 Opcode = AMDGPU::S_MOV_B64; 629 Idx++; 630 } 631 632 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) 633 .addReg(RI.getSubReg(SrcReg, SubIdx)) 634 .addReg(SrcReg, RegState::Implicit); 635 636 if (!FirstMI) 637 FirstMI = LastMI; 638 639 if (!Forward) 640 I--; 641 } 642 643 assert(FirstMI && LastMI); 644 if (!Forward) 645 std::swap(FirstMI, LastMI); 646 647 FirstMI->addOperand( 648 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); 649 650 if (KillSrc) 651 LastMI->addRegisterKilled(SrcReg, &RI); 652 } 653 654 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 655 MachineBasicBlock::iterator MI, 656 const DebugLoc &DL, MCRegister DestReg, 657 MCRegister SrcReg, bool KillSrc) const { 658 const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); 659 660 // FIXME: This is hack to resolve copies between 16 bit and 32 bit 661 // registers until all patterns are fixed. 662 if (Fix16BitCopies && 663 ((RI.getRegSizeInBits(*RC) == 16) ^ 664 (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { 665 MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; 666 MCRegister Super = RI.get32BitRegister(RegToFix); 667 assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); 668 RegToFix = Super; 669 670 if (DestReg == SrcReg) { 671 // Insert empty bundle since ExpandPostRA expects an instruction here. 672 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); 673 return; 674 } 675 676 RC = RI.getPhysRegClass(DestReg); 677 } 678 679 if (RC == &AMDGPU::VGPR_32RegClass) { 680 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 681 AMDGPU::SReg_32RegClass.contains(SrcReg) || 682 AMDGPU::AGPR_32RegClass.contains(SrcReg)); 683 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? 684 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; 685 BuildMI(MBB, MI, DL, get(Opc), DestReg) 686 .addReg(SrcReg, getKillRegState(KillSrc)); 687 return; 688 } 689 690 if (RC == &AMDGPU::SReg_32_XM0RegClass || 691 RC == &AMDGPU::SReg_32RegClass) { 692 if (SrcReg == AMDGPU::SCC) { 693 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 694 .addImm(1) 695 .addImm(0); 696 return; 697 } 698 699 if (DestReg == AMDGPU::VCC_LO) { 700 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { 701 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) 702 .addReg(SrcReg, getKillRegState(KillSrc)); 703 } else { 704 // FIXME: Hack until VReg_1 removed. 
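        // The i1 value lives in a VGPR here; V_CMP_NE_U32_e32 writes its
        // result mask implicitly to VCC, which produces the copy into VCC_LO.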
705 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 706 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 707 .addImm(0) 708 .addReg(SrcReg, getKillRegState(KillSrc)); 709 } 710 711 return; 712 } 713 714 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 715 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 716 return; 717 } 718 719 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 720 .addReg(SrcReg, getKillRegState(KillSrc)); 721 return; 722 } 723 724 if (RC == &AMDGPU::SReg_64RegClass) { 725 if (SrcReg == AMDGPU::SCC) { 726 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) 727 .addImm(1) 728 .addImm(0); 729 return; 730 } 731 732 if (DestReg == AMDGPU::VCC) { 733 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 734 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 735 .addReg(SrcReg, getKillRegState(KillSrc)); 736 } else { 737 // FIXME: Hack until VReg_1 removed. 738 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 739 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 740 .addImm(0) 741 .addReg(SrcReg, getKillRegState(KillSrc)); 742 } 743 744 return; 745 } 746 747 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 748 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 749 return; 750 } 751 752 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 753 .addReg(SrcReg, getKillRegState(KillSrc)); 754 return; 755 } 756 757 if (DestReg == AMDGPU::SCC) { 758 // Copying 64-bit or 32-bit sources to SCC barely makes sense, 759 // but SelectionDAG emits such copies for i1 sources. 760 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 761 // This copy can only be produced by patterns 762 // with explicit SCC, which are known to be enabled 763 // only for subtargets with S_CMP_LG_U64 present. 764 assert(ST.hasScalarCompareEq64()); 765 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) 766 .addReg(SrcReg, getKillRegState(KillSrc)) 767 .addImm(0); 768 } else { 769 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 770 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 771 .addReg(SrcReg, getKillRegState(KillSrc)) 772 .addImm(0); 773 } 774 775 return; 776 } 777 778 if (RC == &AMDGPU::AGPR_32RegClass) { 779 if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { 780 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 781 .addReg(SrcReg, getKillRegState(KillSrc)); 782 return; 783 } 784 785 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { 786 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) 787 .addReg(SrcReg, getKillRegState(KillSrc)); 788 return; 789 } 790 791 // FIXME: Pass should maintain scavenger to avoid scan through the block on 792 // every AGPR spill. 
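    // For now a scavenger is constructed per copy; indirectCopyToAGPR scans
    // the block to find a free VGPR to use as the temporary.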
793 RegScavenger RS; 794 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); 795 return; 796 } 797 798 const unsigned Size = RI.getRegSizeInBits(*RC); 799 if (Size == 16) { 800 assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || 801 AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || 802 AMDGPU::SReg_LO16RegClass.contains(SrcReg) || 803 AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); 804 805 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); 806 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); 807 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); 808 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); 809 bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || 810 AMDGPU::SReg_LO16RegClass.contains(DestReg) || 811 AMDGPU::AGPR_LO16RegClass.contains(DestReg); 812 bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || 813 AMDGPU::SReg_LO16RegClass.contains(SrcReg) || 814 AMDGPU::AGPR_LO16RegClass.contains(SrcReg); 815 MCRegister NewDestReg = RI.get32BitRegister(DestReg); 816 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); 817 818 if (IsSGPRDst) { 819 if (!IsSGPRSrc) { 820 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 821 return; 822 } 823 824 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) 825 .addReg(NewSrcReg, getKillRegState(KillSrc)); 826 return; 827 } 828 829 if (IsAGPRDst || IsAGPRSrc) { 830 if (!DstLow || !SrcLow) { 831 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 832 "Cannot use hi16 subreg with an AGPR!"); 833 } 834 835 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); 836 return; 837 } 838 839 if (IsSGPRSrc && !ST.hasSDWAScalar()) { 840 if (!DstLow || !SrcLow) { 841 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 842 "Cannot use hi16 subreg on VI!"); 843 } 844 845 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) 846 .addReg(NewSrcReg, getKillRegState(KillSrc)); 847 return; 848 } 849 850 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) 851 .addImm(0) // src0_modifiers 852 .addReg(NewSrcReg) 853 .addImm(0) // clamp 854 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 855 : AMDGPU::SDWA::SdwaSel::WORD_1) 856 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) 857 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 858 : AMDGPU::SDWA::SdwaSel::WORD_1) 859 .addReg(NewDestReg, RegState::Implicit | RegState::Undef); 860 // First implicit operand is $exec. 
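    // Tie the def to the implicit undef use of NewDestReg added above so the
    // preserved 16-bit half is treated as defined across this write.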
861 MIB->tieOperands(0, MIB->getNumOperands() - 1); 862 return; 863 } 864 865 const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); 866 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { 867 if (ST.hasPackedFP32Ops()) { 868 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) 869 .addImm(SISrcMods::OP_SEL_1) 870 .addReg(SrcReg) 871 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 872 .addReg(SrcReg) 873 .addImm(0) // op_sel_lo 874 .addImm(0) // op_sel_hi 875 .addImm(0) // neg_lo 876 .addImm(0) // neg_hi 877 .addImm(0) // clamp 878 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 879 return; 880 } 881 } 882 883 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 884 if (RI.isSGPRClass(RC)) { 885 if (!RI.isSGPRClass(SrcRC)) { 886 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 887 return; 888 } 889 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); 890 return; 891 } 892 893 unsigned EltSize = 4; 894 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 895 if (RI.hasAGPRs(RC)) { 896 Opcode = (RI.hasVGPRs(SrcRC)) ? 897 AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 898 } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) { 899 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; 900 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && 901 (RI.isProperlyAlignedRC(*RC) && 902 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { 903 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. 904 if (ST.hasPackedFP32Ops()) { 905 Opcode = AMDGPU::V_PK_MOV_B32; 906 EltSize = 8; 907 } 908 } 909 910 // For the cases where we need an intermediate instruction/temporary register 911 // (destination is an AGPR), we need a scavenger. 912 // 913 // FIXME: The pass should maintain this for us so we don't have to re-scan the 914 // whole block for every handled copy. 915 std::unique_ptr<RegScavenger> RS; 916 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) 917 RS.reset(new RegScavenger()); 918 919 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 920 921 // If there is an overlap, we can't kill the super-register on the last 922 // instruction, since it will also kill the components made live by this def. 923 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); 924 925 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 926 unsigned SubIdx; 927 if (Forward) 928 SubIdx = SubIndices[Idx]; 929 else 930 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 931 932 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; 933 934 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { 935 Register ImpDefSuper = Idx == 0 ? 
Register(DestReg) : Register(); 936 Register ImpUseSuper = SrcReg; 937 indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), 938 RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, 939 ImpDefSuper, ImpUseSuper); 940 } else if (Opcode == AMDGPU::V_PK_MOV_B32) { 941 Register DstSubReg = RI.getSubReg(DestReg, SubIdx); 942 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); 943 MachineInstrBuilder MIB = 944 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) 945 .addImm(SISrcMods::OP_SEL_1) 946 .addReg(SrcSubReg) 947 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 948 .addReg(SrcSubReg) 949 .addImm(0) // op_sel_lo 950 .addImm(0) // op_sel_hi 951 .addImm(0) // neg_lo 952 .addImm(0) // neg_hi 953 .addImm(0) // clamp 954 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 955 if (Idx == 0) 956 MIB.addReg(DestReg, RegState::Define | RegState::Implicit); 957 } else { 958 MachineInstrBuilder Builder = 959 BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) 960 .addReg(RI.getSubReg(SrcReg, SubIdx)); 961 if (Idx == 0) 962 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 963 964 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 965 } 966 } 967 } 968 969 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 970 int NewOpc; 971 972 // Try to map original to commuted opcode 973 NewOpc = AMDGPU::getCommuteRev(Opcode); 974 if (NewOpc != -1) 975 // Check if the commuted (REV) opcode exists on the target. 976 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 977 978 // Try to map commuted to original opcode 979 NewOpc = AMDGPU::getCommuteOrig(Opcode); 980 if (NewOpc != -1) 981 // Check if the original (non-REV) opcode exists on the target. 982 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 983 984 return Opcode; 985 } 986 987 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 988 MachineBasicBlock::iterator MI, 989 const DebugLoc &DL, unsigned DestReg, 990 int64_t Value) const { 991 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 992 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 993 if (RegClass == &AMDGPU::SReg_32RegClass || 994 RegClass == &AMDGPU::SGPR_32RegClass || 995 RegClass == &AMDGPU::SReg_32_XM0RegClass || 996 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 997 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 998 .addImm(Value); 999 return; 1000 } 1001 1002 if (RegClass == &AMDGPU::SReg_64RegClass || 1003 RegClass == &AMDGPU::SGPR_64RegClass || 1004 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 1005 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 1006 .addImm(Value); 1007 return; 1008 } 1009 1010 if (RegClass == &AMDGPU::VGPR_32RegClass) { 1011 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 1012 .addImm(Value); 1013 return; 1014 } 1015 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { 1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 1017 .addImm(Value); 1018 return; 1019 } 1020 1021 unsigned EltSize = 4; 1022 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1023 if (RI.isSGPRClass(RegClass)) { 1024 if (RI.getRegSizeInBits(*RegClass) > 32) { 1025 Opcode = AMDGPU::S_MOV_B64; 1026 EltSize = 8; 1027 } else { 1028 Opcode = AMDGPU::S_MOV_B32; 1029 EltSize = 4; 1030 } 1031 } 1032 1033 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 1034 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 1035 int64_t IdxValue = Idx == 0 ? 
Value : 0; 1036 1037 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 1038 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); 1039 Builder.addImm(IdxValue); 1040 } 1041 } 1042 1043 const TargetRegisterClass * 1044 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 1045 return &AMDGPU::VGPR_32RegClass; 1046 } 1047 1048 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 1049 MachineBasicBlock::iterator I, 1050 const DebugLoc &DL, Register DstReg, 1051 ArrayRef<MachineOperand> Cond, 1052 Register TrueReg, 1053 Register FalseReg) const { 1054 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1055 const TargetRegisterClass *BoolXExecRC = 1056 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1057 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 1058 "Not a VGPR32 reg"); 1059 1060 if (Cond.size() == 1) { 1061 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1062 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1063 .add(Cond[0]); 1064 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1065 .addImm(0) 1066 .addReg(FalseReg) 1067 .addImm(0) 1068 .addReg(TrueReg) 1069 .addReg(SReg); 1070 } else if (Cond.size() == 2) { 1071 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 1072 switch (Cond[0].getImm()) { 1073 case SIInstrInfo::SCC_TRUE: { 1074 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1075 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1076 : AMDGPU::S_CSELECT_B64), SReg) 1077 .addImm(1) 1078 .addImm(0); 1079 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1080 .addImm(0) 1081 .addReg(FalseReg) 1082 .addImm(0) 1083 .addReg(TrueReg) 1084 .addReg(SReg); 1085 break; 1086 } 1087 case SIInstrInfo::SCC_FALSE: { 1088 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1089 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1090 : AMDGPU::S_CSELECT_B64), SReg) 1091 .addImm(0) 1092 .addImm(1); 1093 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1094 .addImm(0) 1095 .addReg(FalseReg) 1096 .addImm(0) 1097 .addReg(TrueReg) 1098 .addReg(SReg); 1099 break; 1100 } 1101 case SIInstrInfo::VCCNZ: { 1102 MachineOperand RegOp = Cond[1]; 1103 RegOp.setImplicit(false); 1104 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1105 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1106 .add(RegOp); 1107 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1108 .addImm(0) 1109 .addReg(FalseReg) 1110 .addImm(0) 1111 .addReg(TrueReg) 1112 .addReg(SReg); 1113 break; 1114 } 1115 case SIInstrInfo::VCCZ: { 1116 MachineOperand RegOp = Cond[1]; 1117 RegOp.setImplicit(false); 1118 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1119 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1120 .add(RegOp); 1121 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1122 .addImm(0) 1123 .addReg(TrueReg) 1124 .addImm(0) 1125 .addReg(FalseReg) 1126 .addReg(SReg); 1127 break; 1128 } 1129 case SIInstrInfo::EXECNZ: { 1130 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1131 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 1132 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 1133 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 1134 .addImm(0); 1135 BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 1136 : AMDGPU::S_CSELECT_B64), SReg) 1137 .addImm(1) 1138 .addImm(0); 1139 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1140 .addImm(0) 1141 .addReg(FalseReg) 1142 .addImm(0) 1143 .addReg(TrueReg) 1144 .addReg(SReg); 1145 break; 1146 } 1147 case SIInstrInfo::EXECZ: { 1148 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1149 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 1150 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 1151 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 1152 .addImm(0); 1153 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1154 : AMDGPU::S_CSELECT_B64), SReg) 1155 .addImm(0) 1156 .addImm(1); 1157 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1158 .addImm(0) 1159 .addReg(FalseReg) 1160 .addImm(0) 1161 .addReg(TrueReg) 1162 .addReg(SReg); 1163 llvm_unreachable("Unhandled branch predicate EXECZ"); 1164 break; 1165 } 1166 default: 1167 llvm_unreachable("invalid branch predicate"); 1168 } 1169 } else { 1170 llvm_unreachable("Can only handle Cond size 1 or 2"); 1171 } 1172 } 1173 1174 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 1175 MachineBasicBlock::iterator I, 1176 const DebugLoc &DL, 1177 Register SrcReg, int Value) const { 1178 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1179 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1180 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 1181 .addImm(Value) 1182 .addReg(SrcReg); 1183 1184 return Reg; 1185 } 1186 1187 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, 1188 MachineBasicBlock::iterator I, 1189 const DebugLoc &DL, 1190 Register SrcReg, int Value) const { 1191 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1192 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1193 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 1194 .addImm(Value) 1195 .addReg(SrcReg); 1196 1197 return Reg; 1198 } 1199 1200 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 1201 1202 if (RI.hasAGPRs(DstRC)) 1203 return AMDGPU::COPY; 1204 if (RI.getRegSizeInBits(*DstRC) == 32) { 1205 return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1206 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 1207 return AMDGPU::S_MOV_B64; 1208 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 1209 return AMDGPU::V_MOV_B64_PSEUDO; 1210 } 1211 return AMDGPU::COPY; 1212 } 1213 1214 const MCInstrDesc & 1215 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, 1216 bool IsIndirectSrc) const { 1217 if (IsIndirectSrc) { 1218 if (VecSize <= 32) // 4 bytes 1219 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); 1220 if (VecSize <= 64) // 8 bytes 1221 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); 1222 if (VecSize <= 96) // 12 bytes 1223 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); 1224 if (VecSize <= 128) // 16 bytes 1225 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); 1226 if (VecSize <= 160) // 20 bytes 1227 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); 1228 if (VecSize <= 256) // 32 bytes 1229 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); 1230 if (VecSize <= 512) // 64 bytes 1231 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); 1232 if (VecSize <= 1024) // 128 bytes 1233 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); 1234 1235 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); 1236 } 1237 1238 if (VecSize <= 32) // 4 bytes 1239 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); 1240 if (VecSize <= 64) // 8 bytes 1241 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); 1242 if (VecSize <= 96) // 12 bytes 1243 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); 1244 if (VecSize <= 128) // 16 bytes 1245 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); 1246 if (VecSize <= 160) // 20 bytes 1247 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); 1248 if (VecSize <= 256) // 32 bytes 1249 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); 1250 if (VecSize <= 512) // 64 bytes 1251 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); 1252 if (VecSize <= 1024) // 128 bytes 1253 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); 1254 1255 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); 1256 } 1257 1258 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { 1259 if (VecSize <= 32) // 4 bytes 1260 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; 1261 if (VecSize <= 64) // 8 bytes 1262 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1263 if (VecSize <= 96) // 12 bytes 1264 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; 1265 if (VecSize <= 128) // 16 bytes 1266 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1267 if (VecSize <= 160) // 20 bytes 1268 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; 1269 if (VecSize <= 256) // 32 bytes 1270 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; 1271 if (VecSize <= 512) // 64 bytes 1272 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; 1273 if (VecSize <= 1024) // 128 bytes 1274 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; 1275 1276 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1277 } 1278 1279 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { 1280 if (VecSize <= 32) // 4 bytes 1281 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; 1282 if (VecSize <= 64) // 8 bytes 1283 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1284 if (VecSize <= 96) // 12 bytes 1285 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; 1286 if (VecSize <= 128) // 16 bytes 1287 return 
AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1288 if (VecSize <= 160) // 20 bytes 1289 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; 1290 if (VecSize <= 256) // 32 bytes 1291 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; 1292 if (VecSize <= 512) // 64 bytes 1293 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; 1294 if (VecSize <= 1024) // 128 bytes 1295 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; 1296 1297 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1298 } 1299 1300 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { 1301 if (VecSize <= 64) // 8 bytes 1302 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; 1303 if (VecSize <= 128) // 16 bytes 1304 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; 1305 if (VecSize <= 256) // 32 bytes 1306 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; 1307 if (VecSize <= 512) // 64 bytes 1308 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; 1309 if (VecSize <= 1024) // 128 bytes 1310 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; 1311 1312 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1313 } 1314 1315 const MCInstrDesc & 1316 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, 1317 bool IsSGPR) const { 1318 if (IsSGPR) { 1319 switch (EltSize) { 1320 case 32: 1321 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); 1322 case 64: 1323 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); 1324 default: 1325 llvm_unreachable("invalid reg indexing elt size"); 1326 } 1327 } 1328 1329 assert(EltSize == 32 && "invalid reg indexing elt size"); 1330 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); 1331 } 1332 1333 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 1334 switch (Size) { 1335 case 4: 1336 return AMDGPU::SI_SPILL_S32_SAVE; 1337 case 8: 1338 return AMDGPU::SI_SPILL_S64_SAVE; 1339 case 12: 1340 return AMDGPU::SI_SPILL_S96_SAVE; 1341 case 16: 1342 return AMDGPU::SI_SPILL_S128_SAVE; 1343 case 20: 1344 return AMDGPU::SI_SPILL_S160_SAVE; 1345 case 24: 1346 return AMDGPU::SI_SPILL_S192_SAVE; 1347 case 28: 1348 return AMDGPU::SI_SPILL_S224_SAVE; 1349 case 32: 1350 return AMDGPU::SI_SPILL_S256_SAVE; 1351 case 64: 1352 return AMDGPU::SI_SPILL_S512_SAVE; 1353 case 128: 1354 return AMDGPU::SI_SPILL_S1024_SAVE; 1355 default: 1356 llvm_unreachable("unknown register size"); 1357 } 1358 } 1359 1360 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 1361 switch (Size) { 1362 case 4: 1363 return AMDGPU::SI_SPILL_V32_SAVE; 1364 case 8: 1365 return AMDGPU::SI_SPILL_V64_SAVE; 1366 case 12: 1367 return AMDGPU::SI_SPILL_V96_SAVE; 1368 case 16: 1369 return AMDGPU::SI_SPILL_V128_SAVE; 1370 case 20: 1371 return AMDGPU::SI_SPILL_V160_SAVE; 1372 case 24: 1373 return AMDGPU::SI_SPILL_V192_SAVE; 1374 case 28: 1375 return AMDGPU::SI_SPILL_V224_SAVE; 1376 case 32: 1377 return AMDGPU::SI_SPILL_V256_SAVE; 1378 case 64: 1379 return AMDGPU::SI_SPILL_V512_SAVE; 1380 case 128: 1381 return AMDGPU::SI_SPILL_V1024_SAVE; 1382 default: 1383 llvm_unreachable("unknown register size"); 1384 } 1385 } 1386 1387 static unsigned getAGPRSpillSaveOpcode(unsigned Size) { 1388 switch (Size) { 1389 case 4: 1390 return AMDGPU::SI_SPILL_A32_SAVE; 1391 case 8: 1392 return AMDGPU::SI_SPILL_A64_SAVE; 1393 case 12: 1394 return AMDGPU::SI_SPILL_A96_SAVE; 1395 case 16: 1396 return AMDGPU::SI_SPILL_A128_SAVE; 1397 case 20: 1398 return AMDGPU::SI_SPILL_A160_SAVE; 1399 case 24: 1400 return AMDGPU::SI_SPILL_A192_SAVE; 1401 case 28: 1402 return AMDGPU::SI_SPILL_A224_SAVE; 
  case 32:
    return AMDGPU::SI_SPILL_A256_SAVE;
  case 64:
    return AMDGPU::SI_SPILL_A512_SAVE;
  case 128:
    return AMDGPU::SI_SPILL_A1024_SAVE;
  default:
    llvm_unreachable("unknown register size");
  }
}

void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = RI.hasAGPRs(RC) ?
getAGPRSpillSaveOpcode(SpillSize) 1461 : getVGPRSpillSaveOpcode(SpillSize); 1462 MFI->setHasSpilledVGPRs(); 1463 1464 BuildMI(MBB, MI, DL, get(Opcode)) 1465 .addReg(SrcReg, getKillRegState(isKill)) // data 1466 .addFrameIndex(FrameIndex) // addr 1467 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset 1468 .addImm(0) // offset 1469 .addMemOperand(MMO); 1470 } 1471 1472 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 1473 switch (Size) { 1474 case 4: 1475 return AMDGPU::SI_SPILL_S32_RESTORE; 1476 case 8: 1477 return AMDGPU::SI_SPILL_S64_RESTORE; 1478 case 12: 1479 return AMDGPU::SI_SPILL_S96_RESTORE; 1480 case 16: 1481 return AMDGPU::SI_SPILL_S128_RESTORE; 1482 case 20: 1483 return AMDGPU::SI_SPILL_S160_RESTORE; 1484 case 24: 1485 return AMDGPU::SI_SPILL_S192_RESTORE; 1486 case 28: 1487 return AMDGPU::SI_SPILL_S224_RESTORE; 1488 case 32: 1489 return AMDGPU::SI_SPILL_S256_RESTORE; 1490 case 64: 1491 return AMDGPU::SI_SPILL_S512_RESTORE; 1492 case 128: 1493 return AMDGPU::SI_SPILL_S1024_RESTORE; 1494 default: 1495 llvm_unreachable("unknown register size"); 1496 } 1497 } 1498 1499 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 1500 switch (Size) { 1501 case 4: 1502 return AMDGPU::SI_SPILL_V32_RESTORE; 1503 case 8: 1504 return AMDGPU::SI_SPILL_V64_RESTORE; 1505 case 12: 1506 return AMDGPU::SI_SPILL_V96_RESTORE; 1507 case 16: 1508 return AMDGPU::SI_SPILL_V128_RESTORE; 1509 case 20: 1510 return AMDGPU::SI_SPILL_V160_RESTORE; 1511 case 24: 1512 return AMDGPU::SI_SPILL_V192_RESTORE; 1513 case 28: 1514 return AMDGPU::SI_SPILL_V224_RESTORE; 1515 case 32: 1516 return AMDGPU::SI_SPILL_V256_RESTORE; 1517 case 64: 1518 return AMDGPU::SI_SPILL_V512_RESTORE; 1519 case 128: 1520 return AMDGPU::SI_SPILL_V1024_RESTORE; 1521 default: 1522 llvm_unreachable("unknown register size"); 1523 } 1524 } 1525 1526 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { 1527 switch (Size) { 1528 case 4: 1529 return AMDGPU::SI_SPILL_A32_RESTORE; 1530 case 8: 1531 return AMDGPU::SI_SPILL_A64_RESTORE; 1532 case 12: 1533 return AMDGPU::SI_SPILL_A96_RESTORE; 1534 case 16: 1535 return AMDGPU::SI_SPILL_A128_RESTORE; 1536 case 20: 1537 return AMDGPU::SI_SPILL_A160_RESTORE; 1538 case 24: 1539 return AMDGPU::SI_SPILL_A192_RESTORE; 1540 case 28: 1541 return AMDGPU::SI_SPILL_A224_RESTORE; 1542 case 32: 1543 return AMDGPU::SI_SPILL_A256_RESTORE; 1544 case 64: 1545 return AMDGPU::SI_SPILL_A512_RESTORE; 1546 case 128: 1547 return AMDGPU::SI_SPILL_A1024_RESTORE; 1548 default: 1549 llvm_unreachable("unknown register size"); 1550 } 1551 } 1552 1553 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 1554 MachineBasicBlock::iterator MI, 1555 Register DestReg, int FrameIndex, 1556 const TargetRegisterClass *RC, 1557 const TargetRegisterInfo *TRI) const { 1558 MachineFunction *MF = MBB.getParent(); 1559 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1560 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1561 const DebugLoc &DL = MBB.findDebugLoc(MI); 1562 unsigned SpillSize = TRI->getSpillSize(*RC); 1563 1564 MachinePointerInfo PtrInfo 1565 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 1566 1567 MachineMemOperand *MMO = MF->getMachineMemOperand( 1568 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), 1569 FrameInfo.getObjectAlign(FrameIndex)); 1570 1571 if (RI.isSGPRClass(RC)) { 1572 MFI->setHasSpilledSGPRs(); 1573 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); 1574 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && 
1575 DestReg != AMDGPU::EXEC && "exec should not be spilled"); 1576 1577 // FIXME: Maybe this should not include a memoperand because it will be 1578 // lowered to non-memory instructions. 1579 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 1580 if (DestReg.isVirtual() && SpillSize == 4) { 1581 MachineRegisterInfo &MRI = MF->getRegInfo(); 1582 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 1583 } 1584 1585 if (RI.spillSGPRToVGPR()) 1586 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); 1587 BuildMI(MBB, MI, DL, OpDesc, DestReg) 1588 .addFrameIndex(FrameIndex) // addr 1589 .addMemOperand(MMO) 1590 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); 1591 1592 return; 1593 } 1594 1595 unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize) 1596 : getVGPRSpillRestoreOpcode(SpillSize); 1597 BuildMI(MBB, MI, DL, get(Opcode), DestReg) 1598 .addFrameIndex(FrameIndex) // vaddr 1599 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset 1600 .addImm(0) // offset 1601 .addMemOperand(MMO); 1602 } 1603 1604 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, 1605 MachineBasicBlock::iterator MI) const { 1606 insertNoops(MBB, MI, 1); 1607 } 1608 1609 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, 1610 MachineBasicBlock::iterator MI, 1611 unsigned Quantity) const { 1612 DebugLoc DL = MBB.findDebugLoc(MI); 1613 while (Quantity > 0) { 1614 unsigned Arg = std::min(Quantity, 8u); 1615 Quantity -= Arg; 1616 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); 1617 } 1618 } 1619 1620 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { 1621 auto MF = MBB.getParent(); 1622 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); 1623 1624 assert(Info->isEntryFunction()); 1625 1626 if (MBB.succ_empty()) { 1627 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); 1628 if (HasNoTerminator) { 1629 if (Info->returnsVoid()) { 1630 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); 1631 } else { 1632 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); 1633 } 1634 } 1635 } 1636 } 1637 1638 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { 1639 switch (MI.getOpcode()) { 1640 default: return 1; // FIXME: Do wait states equal cycles? 1641 1642 case AMDGPU::S_NOP: 1643 return MI.getOperand(0).getImm() + 1; 1644 } 1645 } 1646 1647 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1648 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 1649 MachineBasicBlock &MBB = *MI.getParent(); 1650 DebugLoc DL = MBB.findDebugLoc(MI); 1651 switch (MI.getOpcode()) { 1652 default: return TargetInstrInfo::expandPostRAPseudo(MI); 1653 case AMDGPU::S_MOV_B64_term: 1654 // This is only a terminator to get the correct spill code placement during 1655 // register allocation. 1656 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1657 break; 1658 1659 case AMDGPU::S_MOV_B32_term: 1660 // This is only a terminator to get the correct spill code placement during 1661 // register allocation. 1662 MI.setDesc(get(AMDGPU::S_MOV_B32)); 1663 break; 1664 1665 case AMDGPU::S_XOR_B64_term: 1666 // This is only a terminator to get the correct spill code placement during 1667 // register allocation. 1668 MI.setDesc(get(AMDGPU::S_XOR_B64)); 1669 break; 1670 1671 case AMDGPU::S_XOR_B32_term: 1672 // This is only a terminator to get the correct spill code placement during 1673 // register allocation. 
1674 MI.setDesc(get(AMDGPU::S_XOR_B32)); 1675 break; 1676 case AMDGPU::S_OR_B64_term: 1677 // This is only a terminator to get the correct spill code placement during 1678 // register allocation. 1679 MI.setDesc(get(AMDGPU::S_OR_B64)); 1680 break; 1681 case AMDGPU::S_OR_B32_term: 1682 // This is only a terminator to get the correct spill code placement during 1683 // register allocation. 1684 MI.setDesc(get(AMDGPU::S_OR_B32)); 1685 break; 1686 1687 case AMDGPU::S_ANDN2_B64_term: 1688 // This is only a terminator to get the correct spill code placement during 1689 // register allocation. 1690 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1691 break; 1692 1693 case AMDGPU::S_ANDN2_B32_term: 1694 // This is only a terminator to get the correct spill code placement during 1695 // register allocation. 1696 MI.setDesc(get(AMDGPU::S_ANDN2_B32)); 1697 break; 1698 1699 case AMDGPU::S_AND_B64_term: 1700 // This is only a terminator to get the correct spill code placement during 1701 // register allocation. 1702 MI.setDesc(get(AMDGPU::S_AND_B64)); 1703 break; 1704 1705 case AMDGPU::S_AND_B32_term: 1706 // This is only a terminator to get the correct spill code placement during 1707 // register allocation. 1708 MI.setDesc(get(AMDGPU::S_AND_B32)); 1709 break; 1710 1711 case AMDGPU::V_MOV_B64_PSEUDO: { 1712 Register Dst = MI.getOperand(0).getReg(); 1713 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1714 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1715 1716 const MachineOperand &SrcOp = MI.getOperand(1); 1717 // FIXME: Will this work for 64-bit floating point immediates? 1718 assert(!SrcOp.isFPImm()); 1719 if (SrcOp.isImm()) { 1720 APInt Imm(64, SrcOp.getImm()); 1721 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 1722 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 1723 if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { 1724 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 1725 .addImm(SISrcMods::OP_SEL_1) 1726 .addImm(Lo.getSExtValue()) 1727 .addImm(SISrcMods::OP_SEL_1) 1728 .addImm(Lo.getSExtValue()) 1729 .addImm(0) // op_sel_lo 1730 .addImm(0) // op_sel_hi 1731 .addImm(0) // neg_lo 1732 .addImm(0) // neg_hi 1733 .addImm(0); // clamp 1734 } else { 1735 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1736 .addImm(Lo.getSExtValue()) 1737 .addReg(Dst, RegState::Implicit | RegState::Define); 1738 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1739 .addImm(Hi.getSExtValue()) 1740 .addReg(Dst, RegState::Implicit | RegState::Define); 1741 } 1742 } else { 1743 assert(SrcOp.isReg()); 1744 if (ST.hasPackedFP32Ops() && 1745 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { 1746 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 1747 .addImm(SISrcMods::OP_SEL_1) // src0_mod 1748 .addReg(SrcOp.getReg()) 1749 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod 1750 .addReg(SrcOp.getReg()) 1751 .addImm(0) // op_sel_lo 1752 .addImm(0) // op_sel_hi 1753 .addImm(0) // neg_lo 1754 .addImm(0) // neg_hi 1755 .addImm(0); // clamp 1756 } else { 1757 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 1758 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 1759 .addReg(Dst, RegState::Implicit | RegState::Define); 1760 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 1761 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 1762 .addReg(Dst, RegState::Implicit | RegState::Define); 1763 } 1764 } 1765 MI.eraseFromParent(); 1766 break; 1767 } 1768 case AMDGPU::V_MOV_B64_DPP_PSEUDO: { 1769 expandMovDPP64(MI); 1770 break; 1771 } 1772 case 
AMDGPU::S_MOV_B64_IMM_PSEUDO: { 1773 const MachineOperand &SrcOp = MI.getOperand(1); 1774 assert(!SrcOp.isFPImm()); 1775 APInt Imm(64, SrcOp.getImm()); 1776 if (Imm.isIntN(32) || isInlineConstant(Imm)) { 1777 MI.setDesc(get(AMDGPU::S_MOV_B64)); 1778 break; 1779 } 1780 1781 Register Dst = MI.getOperand(0).getReg(); 1782 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 1783 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 1784 1785 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 1786 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 1787 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) 1788 .addImm(Lo.getSExtValue()) 1789 .addReg(Dst, RegState::Implicit | RegState::Define); 1790 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) 1791 .addImm(Hi.getSExtValue()) 1792 .addReg(Dst, RegState::Implicit | RegState::Define); 1793 MI.eraseFromParent(); 1794 break; 1795 } 1796 case AMDGPU::V_SET_INACTIVE_B32: { 1797 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1798 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1799 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 1800 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 1801 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 1802 .add(MI.getOperand(2)); 1803 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1804 .addReg(Exec); 1805 MI.eraseFromParent(); 1806 break; 1807 } 1808 case AMDGPU::V_SET_INACTIVE_B64: { 1809 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 1810 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 1811 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 1812 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 1813 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 1814 MI.getOperand(0).getReg()) 1815 .add(MI.getOperand(2)); 1816 expandPostRAPseudo(*Copy); 1817 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 1818 .addReg(Exec); 1819 MI.eraseFromParent(); 1820 break; 1821 } 1822 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: 1823 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: 1824 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: 1825 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: 1826 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: 1827 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: 1828 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: 1829 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: 1830 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: 1831 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: 1832 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: 1833 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: 1834 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: 1835 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: 1836 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: 1837 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: 1838 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: 1839 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: 1840 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: 1841 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: 1842 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { 1843 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 1844 1845 unsigned Opc; 1846 if (RI.hasVGPRs(EltRC)) { 1847 Opc = AMDGPU::V_MOVRELD_B32_e32; 1848 } else { 1849 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? 
AMDGPU::S_MOVRELD_B64 1850 : AMDGPU::S_MOVRELD_B32; 1851 } 1852 1853 const MCInstrDesc &OpDesc = get(Opc); 1854 Register VecReg = MI.getOperand(0).getReg(); 1855 bool IsUndef = MI.getOperand(1).isUndef(); 1856 unsigned SubReg = MI.getOperand(3).getImm(); 1857 assert(VecReg == MI.getOperand(1).getReg()); 1858 1859 MachineInstrBuilder MIB = 1860 BuildMI(MBB, MI, DL, OpDesc) 1861 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1862 .add(MI.getOperand(2)) 1863 .addReg(VecReg, RegState::ImplicitDefine) 1864 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 1865 1866 const int ImpDefIdx = 1867 OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 1868 const int ImpUseIdx = ImpDefIdx + 1; 1869 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 1870 MI.eraseFromParent(); 1871 break; 1872 } 1873 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: 1874 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: 1875 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: 1876 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: 1877 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: 1878 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: 1879 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: 1880 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { 1881 assert(ST.useVGPRIndexMode()); 1882 Register VecReg = MI.getOperand(0).getReg(); 1883 bool IsUndef = MI.getOperand(1).isUndef(); 1884 Register Idx = MI.getOperand(3).getReg(); 1885 Register SubReg = MI.getOperand(4).getImm(); 1886 1887 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 1888 .addReg(Idx) 1889 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 1890 SetOn->getOperand(3).setIsUndef(); 1891 1892 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect); 1893 MachineInstrBuilder MIB = 1894 BuildMI(MBB, MI, DL, OpDesc) 1895 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1896 .add(MI.getOperand(2)) 1897 .addReg(VecReg, RegState::ImplicitDefine) 1898 .addReg(VecReg, 1899 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 1900 1901 const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); 1902 const int ImpUseIdx = ImpDefIdx + 1; 1903 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 1904 1905 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); 1906 1907 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); 1908 1909 MI.eraseFromParent(); 1910 break; 1911 } 1912 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: 1913 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: 1914 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: 1915 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: 1916 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: 1917 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: 1918 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: 1919 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { 1920 assert(ST.useVGPRIndexMode()); 1921 Register Dst = MI.getOperand(0).getReg(); 1922 Register VecReg = MI.getOperand(1).getReg(); 1923 bool IsUndef = MI.getOperand(1).isUndef(); 1924 Register Idx = MI.getOperand(2).getReg(); 1925 Register SubReg = MI.getOperand(3).getImm(); 1926 1927 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 1928 .addReg(Idx) 1929 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 1930 SetOn->getOperand(3).setIsUndef(); 1931 1932 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32)) 1933 .addDef(Dst) 1934 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 1935 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0))
1936         .addReg(AMDGPU::M0, RegState::Implicit);
1937
1938     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
1939
1940     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
1941
1942     MI.eraseFromParent();
1943     break;
1944   }
1945   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1946     MachineFunction &MF = *MBB.getParent();
1947     Register Reg = MI.getOperand(0).getReg();
1948     Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1949     Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1950
1951     // Create a bundle so these instructions won't be re-ordered by the
1952     // post-RA scheduler.
1953     MIBundleBuilder Bundler(MBB, MI);
1954     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1955
1956     // Add 32-bit offset from this instruction to the start of the
1957     // constant data.
1958     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1959                        .addReg(RegLo)
1960                        .add(MI.getOperand(1)));
1961
1962     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1963                                   .addReg(RegHi);
1964     MIB.add(MI.getOperand(2));
1965
1966     Bundler.append(MIB);
1967     finalizeBundle(MBB, Bundler.begin());
1968
1969     MI.eraseFromParent();
1970     break;
1971   }
1972   case AMDGPU::ENTER_STRICT_WWM: {
1973     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1974     // Whole Wave Mode is entered.
1975     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1976                                  : AMDGPU::S_OR_SAVEEXEC_B64));
1977     break;
1978   }
1979   case AMDGPU::ENTER_STRICT_WQM: {
1980     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1981     // STRICT_WQM is entered.
1982     const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1983     const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
1984     const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1985     BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
1986     BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
1987
1988     MI.eraseFromParent();
1989     break;
1990   }
1991   case AMDGPU::EXIT_STRICT_WWM:
1992   case AMDGPU::EXIT_STRICT_WQM: {
1993     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1994     // WWM/STRICT_WQM is exited.
1995     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
1996     break;
1997   }
1998   }
1999   return true;
2000 }
2001
2002 std::pair<MachineInstr*, MachineInstr*>
2003 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2004   assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2005
2006   MachineBasicBlock &MBB = *MI.getParent();
2007   DebugLoc DL = MBB.findDebugLoc(MI);
2008   MachineFunction *MF = MBB.getParent();
2009   MachineRegisterInfo &MRI = MF->getRegInfo();
2010   Register Dst = MI.getOperand(0).getReg();
2011   unsigned Part = 0;
2012   MachineInstr *Split[2];
2013
2014   for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2015     auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2016     if (Dst.isPhysical()) {
2017       MovDPP.addDef(RI.getSubReg(Dst, Sub));
2018     } else {
2019       assert(MRI.isSSA());
2020       auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2021       MovDPP.addDef(Tmp);
2022     }
2023
2024     for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
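      // Feed each half of the 64-bit old/src operands to this per-subregister
      // V_MOV_B32_dpp: immediates are shifted down by 32 * Part, register
      // operands use the matching sub0/sub1 subregister.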
2025 const MachineOperand &SrcOp = MI.getOperand(I); 2026 assert(!SrcOp.isFPImm()); 2027 if (SrcOp.isImm()) { 2028 APInt Imm(64, SrcOp.getImm()); 2029 Imm.ashrInPlace(Part * 32); 2030 MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); 2031 } else { 2032 assert(SrcOp.isReg()); 2033 Register Src = SrcOp.getReg(); 2034 if (Src.isPhysical()) 2035 MovDPP.addReg(RI.getSubReg(Src, Sub)); 2036 else 2037 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); 2038 } 2039 } 2040 2041 for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) 2042 MovDPP.addImm(MI.getOperand(I).getImm()); 2043 2044 Split[Part] = MovDPP; 2045 ++Part; 2046 } 2047 2048 if (Dst.isVirtual()) 2049 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 2050 .addReg(Split[0]->getOperand(0).getReg()) 2051 .addImm(AMDGPU::sub0) 2052 .addReg(Split[1]->getOperand(0).getReg()) 2053 .addImm(AMDGPU::sub1); 2054 2055 MI.eraseFromParent(); 2056 return std::make_pair(Split[0], Split[1]); 2057 } 2058 2059 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 2060 MachineOperand &Src0, 2061 unsigned Src0OpName, 2062 MachineOperand &Src1, 2063 unsigned Src1OpName) const { 2064 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 2065 if (!Src0Mods) 2066 return false; 2067 2068 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 2069 assert(Src1Mods && 2070 "All commutable instructions have both src0 and src1 modifiers"); 2071 2072 int Src0ModsVal = Src0Mods->getImm(); 2073 int Src1ModsVal = Src1Mods->getImm(); 2074 2075 Src1Mods->setImm(Src0ModsVal); 2076 Src0Mods->setImm(Src1ModsVal); 2077 return true; 2078 } 2079 2080 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 2081 MachineOperand &RegOp, 2082 MachineOperand &NonRegOp) { 2083 Register Reg = RegOp.getReg(); 2084 unsigned SubReg = RegOp.getSubReg(); 2085 bool IsKill = RegOp.isKill(); 2086 bool IsDead = RegOp.isDead(); 2087 bool IsUndef = RegOp.isUndef(); 2088 bool IsDebug = RegOp.isDebug(); 2089 2090 if (NonRegOp.isImm()) 2091 RegOp.ChangeToImmediate(NonRegOp.getImm()); 2092 else if (NonRegOp.isFI()) 2093 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 2094 else if (NonRegOp.isGlobal()) { 2095 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), 2096 NonRegOp.getTargetFlags()); 2097 } else 2098 return nullptr; 2099 2100 // Make sure we don't reinterpret a subreg index in the target flags. 2101 RegOp.setTargetFlags(NonRegOp.getTargetFlags()); 2102 2103 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 2104 NonRegOp.setSubReg(SubReg); 2105 2106 return &MI; 2107 } 2108 2109 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 2110 unsigned Src0Idx, 2111 unsigned Src1Idx) const { 2112 assert(!NewMI && "this should never be used"); 2113 2114 unsigned Opc = MI.getOpcode(); 2115 int CommutedOpcode = commuteOpcode(Opc); 2116 if (CommutedOpcode == -1) 2117 return nullptr; 2118 2119 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 2120 static_cast<int>(Src0Idx) && 2121 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 2122 static_cast<int>(Src1Idx) && 2123 "inconsistency with findCommutedOpIndices"); 2124 2125 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2126 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2127 2128 MachineInstr *CommutedMI = nullptr; 2129 if (Src0.isReg() && Src1.isReg()) { 2130 if (isOperandLegal(MI, Src1Idx, &Src0)) { 2131 // Be sure to copy the source modifiers to the right place. 
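        // The generic implementation only swaps the two register operands;
        // the src0/src1 modifiers are swapped further down once the commute
        // has succeeded.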
2132 CommutedMI 2133 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 2134 } 2135 2136 } else if (Src0.isReg() && !Src1.isReg()) { 2137 // src0 should always be able to support any operand type, so no need to 2138 // check operand legality. 2139 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 2140 } else if (!Src0.isReg() && Src1.isReg()) { 2141 if (isOperandLegal(MI, Src1Idx, &Src0)) 2142 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 2143 } else { 2144 // FIXME: Found two non registers to commute. This does happen. 2145 return nullptr; 2146 } 2147 2148 if (CommutedMI) { 2149 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 2150 Src1, AMDGPU::OpName::src1_modifiers); 2151 2152 CommutedMI->setDesc(get(CommutedOpcode)); 2153 } 2154 2155 return CommutedMI; 2156 } 2157 2158 // This needs to be implemented because the source modifiers may be inserted 2159 // between the true commutable operands, and the base 2160 // TargetInstrInfo::commuteInstruction uses it. 2161 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, 2162 unsigned &SrcOpIdx0, 2163 unsigned &SrcOpIdx1) const { 2164 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 2165 } 2166 2167 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, 2168 unsigned &SrcOpIdx1) const { 2169 if (!Desc.isCommutable()) 2170 return false; 2171 2172 unsigned Opc = Desc.getOpcode(); 2173 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2174 if (Src0Idx == -1) 2175 return false; 2176 2177 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2178 if (Src1Idx == -1) 2179 return false; 2180 2181 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 2182 } 2183 2184 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 2185 int64_t BrOffset) const { 2186 // BranchRelaxation should never have to check s_setpc_b64 because its dest 2187 // block is unanalyzable. 2188 assert(BranchOp != AMDGPU::S_SETPC_B64); 2189 2190 // Convert to dwords. 2191 BrOffset /= 4; 2192 2193 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 2194 // from the next instruction. 2195 BrOffset -= 1; 2196 2197 return isIntN(BranchOffsetBits, BrOffset); 2198 } 2199 2200 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 2201 const MachineInstr &MI) const { 2202 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 2203 // This would be a difficult analysis to perform, but can always be legal so 2204 // there's no need to analyze it. 2205 return nullptr; 2206 } 2207 2208 return MI.getOperand(0).getMBB(); 2209 } 2210 2211 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 2212 MachineBasicBlock &DestBB, 2213 const DebugLoc &DL, 2214 int64_t BrOffset, 2215 RegScavenger *RS) const { 2216 assert(RS && "RegScavenger required for long branching"); 2217 assert(MBB.empty() && 2218 "new block should be inserted for expanding unconditional branch"); 2219 assert(MBB.pred_size() == 1); 2220 2221 MachineFunction *MF = MBB.getParent(); 2222 MachineRegisterInfo &MRI = MF->getRegInfo(); 2223 2224 // FIXME: Virtual register workaround for RegScavenger not working with empty 2225 // blocks. 2226 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2227 2228 auto I = MBB.end(); 2229 2230 // We need to compute the offset relative to the instruction immediately after 2231 // s_getpc_b64. Insert pc arithmetic code before last terminator. 
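  // The emitted sequence is roughly:
  //   s_getpc_b64 s[N:N+1]
  //   s_add_u32   sN,   sN,   (DestBB - post_getpc) & 0xffffffff
  //   s_addc_u32  sN+1, sN+1, (DestBB - post_getpc) >> 32
  //   s_setpc_b64 s[N:N+1]
  // with the two halves of the offset filled in via the OffsetLo/OffsetHi
  // symbols created below.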
2232 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 2233 2234 auto &MCCtx = MF->getContext(); 2235 MCSymbol *PostGetPCLabel = 2236 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); 2237 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); 2238 2239 MCSymbol *OffsetLo = 2240 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); 2241 MCSymbol *OffsetHi = 2242 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); 2243 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 2244 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 2245 .addReg(PCReg, 0, AMDGPU::sub0) 2246 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); 2247 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 2248 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 2249 .addReg(PCReg, 0, AMDGPU::sub1) 2250 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); 2251 2252 // Insert the indirect branch after the other terminator. 2253 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 2254 .addReg(PCReg); 2255 2256 auto ComputeBlockSize = [](const TargetInstrInfo *TII, 2257 const MachineBasicBlock &MBB) { 2258 unsigned Size = 0; 2259 for (const MachineInstr &MI : MBB) 2260 Size += TII->getInstSizeInBytes(MI); 2261 return Size; 2262 }; 2263 2264 // FIXME: If spilling is necessary, this will fail because this scavenger has 2265 // no emergency stack slots. It is non-trivial to spill in this situation, 2266 // because the restore code needs to be specially placed after the 2267 // jump. BranchRelaxation then needs to be made aware of the newly inserted 2268 // block. 2269 // 2270 // If a spill is needed for the pc register pair, we need to insert a spill 2271 // restore block right before the destination block, and insert a short branch 2272 // into the old destination block's fallthrough predecessor. 2273 // e.g.: 2274 // 2275 // s_cbranch_scc0 skip_long_branch: 2276 // 2277 // long_branch_bb: 2278 // spill s[8:9] 2279 // s_getpc_b64 s[8:9] 2280 // s_add_u32 s8, s8, restore_bb 2281 // s_addc_u32 s9, s9, 0 2282 // s_setpc_b64 s[8:9] 2283 // 2284 // skip_long_branch: 2285 // foo; 2286 // 2287 // ..... 2288 // 2289 // dest_bb_fallthrough_predecessor: 2290 // bar; 2291 // s_branch dest_bb 2292 // 2293 // restore_bb: 2294 // restore s[8:9] 2295 // fallthrough dest_bb 2296 /// 2297 // dest_bb: 2298 // buzz; 2299 2300 RS->enterBasicBlockEnd(MBB); 2301 Register Scav = RS->scavengeRegisterBackwards( 2302 AMDGPU::SReg_64RegClass, 2303 MachineBasicBlock::iterator(GetPC), false, 0); 2304 MRI.replaceRegWith(PCReg, Scav); 2305 MRI.clearVirtRegs(); 2306 RS->setRegUsed(Scav); 2307 2308 // Now, the distance could be defined. 2309 auto *Offset = MCBinaryExpr::createSub( 2310 MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), 2311 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); 2312 // Add offset assignments. 
2313 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); 2314 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); 2315 auto *ShAmt = MCConstantExpr::create(32, MCCtx); 2316 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); 2317 return ComputeBlockSize(this, MBB); 2318 } 2319 2320 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 2321 switch (Cond) { 2322 case SIInstrInfo::SCC_TRUE: 2323 return AMDGPU::S_CBRANCH_SCC1; 2324 case SIInstrInfo::SCC_FALSE: 2325 return AMDGPU::S_CBRANCH_SCC0; 2326 case SIInstrInfo::VCCNZ: 2327 return AMDGPU::S_CBRANCH_VCCNZ; 2328 case SIInstrInfo::VCCZ: 2329 return AMDGPU::S_CBRANCH_VCCZ; 2330 case SIInstrInfo::EXECNZ: 2331 return AMDGPU::S_CBRANCH_EXECNZ; 2332 case SIInstrInfo::EXECZ: 2333 return AMDGPU::S_CBRANCH_EXECZ; 2334 default: 2335 llvm_unreachable("invalid branch predicate"); 2336 } 2337 } 2338 2339 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 2340 switch (Opcode) { 2341 case AMDGPU::S_CBRANCH_SCC0: 2342 return SCC_FALSE; 2343 case AMDGPU::S_CBRANCH_SCC1: 2344 return SCC_TRUE; 2345 case AMDGPU::S_CBRANCH_VCCNZ: 2346 return VCCNZ; 2347 case AMDGPU::S_CBRANCH_VCCZ: 2348 return VCCZ; 2349 case AMDGPU::S_CBRANCH_EXECNZ: 2350 return EXECNZ; 2351 case AMDGPU::S_CBRANCH_EXECZ: 2352 return EXECZ; 2353 default: 2354 return INVALID_BR; 2355 } 2356 } 2357 2358 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 2359 MachineBasicBlock::iterator I, 2360 MachineBasicBlock *&TBB, 2361 MachineBasicBlock *&FBB, 2362 SmallVectorImpl<MachineOperand> &Cond, 2363 bool AllowModify) const { 2364 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2365 // Unconditional Branch 2366 TBB = I->getOperand(0).getMBB(); 2367 return false; 2368 } 2369 2370 MachineBasicBlock *CondBB = nullptr; 2371 2372 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 2373 CondBB = I->getOperand(1).getMBB(); 2374 Cond.push_back(I->getOperand(0)); 2375 } else { 2376 BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 2377 if (Pred == INVALID_BR) 2378 return true; 2379 2380 CondBB = I->getOperand(0).getMBB(); 2381 Cond.push_back(MachineOperand::CreateImm(Pred)); 2382 Cond.push_back(I->getOperand(1)); // Save the branch register. 2383 } 2384 ++I; 2385 2386 if (I == MBB.end()) { 2387 // Conditional branch followed by fall-through. 2388 TBB = CondBB; 2389 return false; 2390 } 2391 2392 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2393 TBB = CondBB; 2394 FBB = I->getOperand(0).getMBB(); 2395 return false; 2396 } 2397 2398 return true; 2399 } 2400 2401 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 2402 MachineBasicBlock *&FBB, 2403 SmallVectorImpl<MachineOperand> &Cond, 2404 bool AllowModify) const { 2405 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2406 auto E = MBB.end(); 2407 if (I == E) 2408 return false; 2409 2410 // Skip over the instructions that are artificially terminators for special 2411 // exec management. 
2412 while (I != E && !I->isBranch() && !I->isReturn()) { 2413 switch (I->getOpcode()) { 2414 case AMDGPU::S_MOV_B64_term: 2415 case AMDGPU::S_XOR_B64_term: 2416 case AMDGPU::S_OR_B64_term: 2417 case AMDGPU::S_ANDN2_B64_term: 2418 case AMDGPU::S_AND_B64_term: 2419 case AMDGPU::S_MOV_B32_term: 2420 case AMDGPU::S_XOR_B32_term: 2421 case AMDGPU::S_OR_B32_term: 2422 case AMDGPU::S_ANDN2_B32_term: 2423 case AMDGPU::S_AND_B32_term: 2424 break; 2425 case AMDGPU::SI_IF: 2426 case AMDGPU::SI_ELSE: 2427 case AMDGPU::SI_KILL_I1_TERMINATOR: 2428 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 2429 // FIXME: It's messy that these need to be considered here at all. 2430 return true; 2431 default: 2432 llvm_unreachable("unexpected non-branch terminator inst"); 2433 } 2434 2435 ++I; 2436 } 2437 2438 if (I == E) 2439 return false; 2440 2441 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 2442 } 2443 2444 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 2445 int *BytesRemoved) const { 2446 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2447 2448 unsigned Count = 0; 2449 unsigned RemovedSize = 0; 2450 while (I != MBB.end()) { 2451 MachineBasicBlock::iterator Next = std::next(I); 2452 RemovedSize += getInstSizeInBytes(*I); 2453 I->eraseFromParent(); 2454 ++Count; 2455 I = Next; 2456 } 2457 2458 if (BytesRemoved) 2459 *BytesRemoved = RemovedSize; 2460 2461 return Count; 2462 } 2463 2464 // Copy the flags onto the implicit condition register operand. 2465 static void preserveCondRegFlags(MachineOperand &CondReg, 2466 const MachineOperand &OrigCond) { 2467 CondReg.setIsUndef(OrigCond.isUndef()); 2468 CondReg.setIsKill(OrigCond.isKill()); 2469 } 2470 2471 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2472 MachineBasicBlock *TBB, 2473 MachineBasicBlock *FBB, 2474 ArrayRef<MachineOperand> Cond, 2475 const DebugLoc &DL, 2476 int *BytesAdded) const { 2477 if (!FBB && Cond.empty()) { 2478 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2479 .addMBB(TBB); 2480 if (BytesAdded) 2481 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; 2482 return 1; 2483 } 2484 2485 if(Cond.size() == 1 && Cond[0].isReg()) { 2486 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2487 .add(Cond[0]) 2488 .addMBB(TBB); 2489 return 1; 2490 } 2491 2492 assert(TBB && Cond[0].isImm()); 2493 2494 unsigned Opcode 2495 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2496 2497 if (!FBB) { 2498 Cond[1].isUndef(); 2499 MachineInstr *CondBr = 2500 BuildMI(&MBB, DL, get(Opcode)) 2501 .addMBB(TBB); 2502 2503 // Copy the flags onto the implicit condition register operand. 2504 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2505 fixImplicitOperands(*CondBr); 2506 2507 if (BytesAdded) 2508 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; 2509 return 1; 2510 } 2511 2512 assert(TBB && FBB); 2513 2514 MachineInstr *CondBr = 2515 BuildMI(&MBB, DL, get(Opcode)) 2516 .addMBB(TBB); 2517 fixImplicitOperands(*CondBr); 2518 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2519 .addMBB(FBB); 2520 2521 MachineOperand &CondReg = CondBr->getOperand(1); 2522 CondReg.setIsUndef(Cond[1].isUndef()); 2523 CondReg.setIsKill(Cond[1].isKill()); 2524 2525 if (BytesAdded) 2526 *BytesAdded = ST.hasOffset3fBug() ? 
16 : 8; 2527 2528 return 2; 2529 } 2530 2531 bool SIInstrInfo::reverseBranchCondition( 2532 SmallVectorImpl<MachineOperand> &Cond) const { 2533 if (Cond.size() != 2) { 2534 return true; 2535 } 2536 2537 if (Cond[0].isImm()) { 2538 Cond[0].setImm(-Cond[0].getImm()); 2539 return false; 2540 } 2541 2542 return true; 2543 } 2544 2545 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2546 ArrayRef<MachineOperand> Cond, 2547 Register DstReg, Register TrueReg, 2548 Register FalseReg, int &CondCycles, 2549 int &TrueCycles, int &FalseCycles) const { 2550 switch (Cond[0].getImm()) { 2551 case VCCNZ: 2552 case VCCZ: { 2553 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2554 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2555 if (MRI.getRegClass(FalseReg) != RC) 2556 return false; 2557 2558 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2559 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2560 2561 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2562 return RI.hasVGPRs(RC) && NumInsts <= 6; 2563 } 2564 case SCC_TRUE: 2565 case SCC_FALSE: { 2566 // FIXME: We could insert for VGPRs if we could replace the original compare 2567 // with a vector one. 2568 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2569 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2570 if (MRI.getRegClass(FalseReg) != RC) 2571 return false; 2572 2573 int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; 2574 2575 // Multiples of 8 can do s_cselect_b64 2576 if (NumInsts % 2 == 0) 2577 NumInsts /= 2; 2578 2579 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2580 return RI.isSGPRClass(RC); 2581 } 2582 default: 2583 return false; 2584 } 2585 } 2586 2587 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2588 MachineBasicBlock::iterator I, const DebugLoc &DL, 2589 Register DstReg, ArrayRef<MachineOperand> Cond, 2590 Register TrueReg, Register FalseReg) const { 2591 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2592 if (Pred == VCCZ || Pred == SCC_FALSE) { 2593 Pred = static_cast<BranchPredicate>(-Pred); 2594 std::swap(TrueReg, FalseReg); 2595 } 2596 2597 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2598 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2599 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2600 2601 if (DstSize == 32) { 2602 MachineInstr *Select; 2603 if (Pred == SCC_TRUE) { 2604 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2605 .addReg(TrueReg) 2606 .addReg(FalseReg); 2607 } else { 2608 // Instruction's operands are backwards from what is expected. 
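      // V_CNDMASK_B32 selects src1 when the condition bit is set, so the
      // false value goes in src0 and the true value in src1.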
2609 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2610 .addReg(FalseReg) 2611 .addReg(TrueReg); 2612 } 2613 2614 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2615 return; 2616 } 2617 2618 if (DstSize == 64 && Pred == SCC_TRUE) { 2619 MachineInstr *Select = 2620 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 2621 .addReg(TrueReg) 2622 .addReg(FalseReg); 2623 2624 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2625 return; 2626 } 2627 2628 static const int16_t Sub0_15[] = { 2629 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 2630 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 2631 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 2632 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 2633 }; 2634 2635 static const int16_t Sub0_15_64[] = { 2636 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 2637 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 2638 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 2639 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 2640 }; 2641 2642 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 2643 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 2644 const int16_t *SubIndices = Sub0_15; 2645 int NElts = DstSize / 32; 2646 2647 // 64-bit select is only available for SALU. 2648 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 2649 if (Pred == SCC_TRUE) { 2650 if (NElts % 2) { 2651 SelOp = AMDGPU::S_CSELECT_B32; 2652 EltRC = &AMDGPU::SGPR_32RegClass; 2653 } else { 2654 SelOp = AMDGPU::S_CSELECT_B64; 2655 EltRC = &AMDGPU::SGPR_64RegClass; 2656 SubIndices = Sub0_15_64; 2657 NElts /= 2; 2658 } 2659 } 2660 2661 MachineInstrBuilder MIB = BuildMI( 2662 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 2663 2664 I = MIB->getIterator(); 2665 2666 SmallVector<Register, 8> Regs; 2667 for (int Idx = 0; Idx != NElts; ++Idx) { 2668 Register DstElt = MRI.createVirtualRegister(EltRC); 2669 Regs.push_back(DstElt); 2670 2671 unsigned SubIdx = SubIndices[Idx]; 2672 2673 MachineInstr *Select; 2674 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 2675 Select = 2676 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2677 .addReg(FalseReg, 0, SubIdx) 2678 .addReg(TrueReg, 0, SubIdx); 2679 } else { 2680 Select = 2681 BuildMI(MBB, I, DL, get(SelOp), DstElt) 2682 .addReg(TrueReg, 0, SubIdx) 2683 .addReg(FalseReg, 0, SubIdx); 2684 } 2685 2686 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2687 fixImplicitOperands(*Select); 2688 2689 MIB.addReg(DstElt) 2690 .addImm(SubIdx); 2691 } 2692 } 2693 2694 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { 2695 switch (MI.getOpcode()) { 2696 case AMDGPU::V_MOV_B32_e32: 2697 case AMDGPU::V_MOV_B32_e64: 2698 case AMDGPU::V_MOV_B64_PSEUDO: { 2699 // If there are additional implicit register operands, this may be used for 2700 // register indexing so the source register operand isn't simply copied. 
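    // A plain mov carries only the operands from its descriptor plus the
    // implicit uses the descriptor declares; anything extra (e.g. an implicit
    // M0 use added for indirect indexing) means this is not a simple copy.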
2701     unsigned NumOps = MI.getDesc().getNumOperands() +
2702       MI.getDesc().getNumImplicitUses();
2703
2704     return MI.getNumOperands() == NumOps;
2705   }
2706   case AMDGPU::S_MOV_B32:
2707   case AMDGPU::S_MOV_B64:
2708   case AMDGPU::COPY:
2709   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2710   case AMDGPU::V_ACCVGPR_READ_B32_e64:
2711   case AMDGPU::V_ACCVGPR_MOV_B32:
2712     return true;
2713   default:
2714     return false;
2715   }
2716 }
2717
2718 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2719     unsigned Kind) const {
2720   switch (Kind) {
2721   case PseudoSourceValue::Stack:
2722   case PseudoSourceValue::FixedStack:
2723     return AMDGPUAS::PRIVATE_ADDRESS;
2724   case PseudoSourceValue::ConstantPool:
2725   case PseudoSourceValue::GOT:
2726   case PseudoSourceValue::JumpTable:
2727   case PseudoSourceValue::GlobalValueCallEntry:
2728   case PseudoSourceValue::ExternalSymbolCallEntry:
2729   case PseudoSourceValue::TargetCustom:
2730     return AMDGPUAS::CONSTANT_ADDRESS;
2731   }
2732   return AMDGPUAS::FLAT_ADDRESS;
2733 }
2734
2735 static void removeModOperands(MachineInstr &MI) {
2736   unsigned Opc = MI.getOpcode();
2737   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2738                                               AMDGPU::OpName::src0_modifiers);
2739   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2740                                               AMDGPU::OpName::src1_modifiers);
2741   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2742                                               AMDGPU::OpName::src2_modifiers);
2743
2744   MI.RemoveOperand(Src2ModIdx);
2745   MI.RemoveOperand(Src1ModIdx);
2746   MI.RemoveOperand(Src0ModIdx);
2747 }
2748
2749 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2750                                 Register Reg, MachineRegisterInfo *MRI) const {
2751   if (!MRI->hasOneNonDBGUse(Reg))
2752     return false;
2753
2754   switch (DefMI.getOpcode()) {
2755   default:
2756     return false;
2757   case AMDGPU::S_MOV_B64:
2758     // TODO: We could fold 64-bit immediates, but this gets complicated
2759     // when there are sub-registers.
2760     return false;
2761
2762   case AMDGPU::V_MOV_B32_e32:
2763   case AMDGPU::S_MOV_B32:
2764   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2765     break;
2766   }
2767
2768   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2769   assert(ImmOp);
2770   // FIXME: We could handle FrameIndex values here.
2771   if (!ImmOp->isImm())
2772     return false;
2773
2774   unsigned Opc = UseMI.getOpcode();
2775   if (Opc == AMDGPU::COPY) {
2776     Register DstReg = UseMI.getOperand(0).getReg();
2777     bool Is16Bit = getOpSize(UseMI, 0) == 2;
2778     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
2779     unsigned NewOpc = isVGPRCopy ?
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 2780 APInt Imm(32, ImmOp->getImm()); 2781 2782 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) 2783 Imm = Imm.ashr(16); 2784 2785 if (RI.isAGPR(*MRI, DstReg)) { 2786 if (!isInlineConstant(Imm)) 2787 return false; 2788 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; 2789 } 2790 2791 if (Is16Bit) { 2792 if (isVGPRCopy) 2793 return false; // Do not clobber vgpr_hi16 2794 2795 if (DstReg.isVirtual() && 2796 UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 2797 return false; 2798 2799 UseMI.getOperand(0).setSubReg(0); 2800 if (DstReg.isPhysical()) { 2801 DstReg = RI.get32BitRegister(DstReg); 2802 UseMI.getOperand(0).setReg(DstReg); 2803 } 2804 assert(UseMI.getOperand(1).getReg().isVirtual()); 2805 } 2806 2807 UseMI.setDesc(get(NewOpc)); 2808 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 2809 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 2810 return true; 2811 } 2812 2813 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 2814 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 2815 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 2816 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) { 2817 // Don't fold if we are using source or output modifiers. The new VOP2 2818 // instructions don't have them. 2819 if (hasAnyModifiersSet(UseMI)) 2820 return false; 2821 2822 // If this is a free constant, there's no reason to do this. 2823 // TODO: We could fold this here instead of letting SIFoldOperands do it 2824 // later. 2825 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 2826 2827 // Any src operand can be used for the legality check. 2828 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 2829 return false; 2830 2831 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 2832 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; 2833 bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 2834 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64; 2835 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 2836 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 2837 2838 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 2839 // We should only expect these to be on src0 due to canonicalizations. 2840 if (Src0->isReg() && Src0->getReg() == Reg) { 2841 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 2842 return false; 2843 2844 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 2845 return false; 2846 2847 unsigned NewOpc = 2848 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16) 2849 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 2850 if (pseudoToMCOpcode(NewOpc) == -1) 2851 return false; 2852 2853 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 2854 2855 const int64_t Imm = ImmOp->getImm(); 2856 2857 // FIXME: This would be a lot easier if we could return a new instruction 2858 // instead of having to modify in place. 2859 2860 // Remove these first since they are at the end. 
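      // omod and clamp sit at the tail of the VOP3 operand list, so removing
      // them first leaves the indices of the earlier operands unchanged.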
2861 UseMI.RemoveOperand( 2862 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2863 UseMI.RemoveOperand( 2864 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2865 2866 Register Src1Reg = Src1->getReg(); 2867 unsigned Src1SubReg = Src1->getSubReg(); 2868 Src0->setReg(Src1Reg); 2869 Src0->setSubReg(Src1SubReg); 2870 Src0->setIsKill(Src1->isKill()); 2871 2872 if (Opc == AMDGPU::V_MAC_F32_e64 || 2873 Opc == AMDGPU::V_MAC_F16_e64 || 2874 Opc == AMDGPU::V_FMAC_F32_e64 || 2875 Opc == AMDGPU::V_FMAC_F16_e64) 2876 UseMI.untieRegOperand( 2877 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2878 2879 Src1->ChangeToImmediate(Imm); 2880 2881 removeModOperands(UseMI); 2882 UseMI.setDesc(get(NewOpc)); 2883 2884 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2885 if (DeleteDef) 2886 DefMI.eraseFromParent(); 2887 2888 return true; 2889 } 2890 2891 // Added part is the constant: Use v_madak_{f16, f32}. 2892 if (Src2->isReg() && Src2->getReg() == Reg) { 2893 // Not allowed to use constant bus for another operand. 2894 // We can however allow an inline immediate as src0. 2895 bool Src0Inlined = false; 2896 if (Src0->isReg()) { 2897 // Try to inline constant if possible. 2898 // If the Def moves immediate and the use is single 2899 // We are saving VGPR here. 2900 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 2901 if (Def && Def->isMoveImmediate() && 2902 isInlineConstant(Def->getOperand(1)) && 2903 MRI->hasOneUse(Src0->getReg())) { 2904 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2905 Src0Inlined = true; 2906 } else if ((Src0->getReg().isPhysical() && 2907 (ST.getConstantBusLimit(Opc) <= 1 && 2908 RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || 2909 (Src0->getReg().isVirtual() && 2910 (ST.getConstantBusLimit(Opc) <= 1 && 2911 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 2912 return false; 2913 // VGPR is okay as Src0 - fallthrough 2914 } 2915 2916 if (Src1->isReg() && !Src0Inlined ) { 2917 // We have one slot for inlinable constant so far - try to fill it 2918 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 2919 if (Def && Def->isMoveImmediate() && 2920 isInlineConstant(Def->getOperand(1)) && 2921 MRI->hasOneUse(Src1->getReg()) && 2922 commuteInstruction(UseMI)) { 2923 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 2924 } else if ((Src1->getReg().isPhysical() && 2925 RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || 2926 (Src1->getReg().isVirtual() && 2927 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 2928 return false; 2929 // VGPR is okay as Src1 - fallthrough 2930 } 2931 2932 unsigned NewOpc = 2933 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16) 2934 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 2935 if (pseudoToMCOpcode(NewOpc) == -1) 2936 return false; 2937 2938 const int64_t Imm = ImmOp->getImm(); 2939 2940 // FIXME: This would be a lot easier if we could return a new instruction 2941 // instead of having to modify in place. 2942 2943 // Remove these first since they are at the end. 2944 UseMI.RemoveOperand( 2945 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); 2946 UseMI.RemoveOperand( 2947 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); 2948 2949 if (Opc == AMDGPU::V_MAC_F32_e64 || 2950 Opc == AMDGPU::V_MAC_F16_e64 || 2951 Opc == AMDGPU::V_FMAC_F32_e64 || 2952 Opc == AMDGPU::V_FMAC_F16_e64) 2953 UseMI.untieRegOperand( 2954 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 2955 2956 // ChangingToImmediate adds Src2 back to the instruction. 
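      // After this rewrite the operand layout matches v_madak/v_fmaak:
      // vdst, src0, src1, literal constant.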
2957 Src2->ChangeToImmediate(Imm); 2958 2959 // These come before src2. 2960 removeModOperands(UseMI); 2961 UseMI.setDesc(get(NewOpc)); 2962 // It might happen that UseMI was commuted 2963 // and we now have SGPR as SRC1. If so 2 inlined 2964 // constant and SGPR are illegal. 2965 legalizeOperands(UseMI); 2966 2967 bool DeleteDef = MRI->hasOneNonDBGUse(Reg); 2968 if (DeleteDef) 2969 DefMI.eraseFromParent(); 2970 2971 return true; 2972 } 2973 } 2974 2975 return false; 2976 } 2977 2978 static bool 2979 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, 2980 ArrayRef<const MachineOperand *> BaseOps2) { 2981 if (BaseOps1.size() != BaseOps2.size()) 2982 return false; 2983 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { 2984 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) 2985 return false; 2986 } 2987 return true; 2988 } 2989 2990 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 2991 int WidthB, int OffsetB) { 2992 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 2993 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 2994 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 2995 return LowOffset + LowWidth <= HighOffset; 2996 } 2997 2998 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 2999 const MachineInstr &MIb) const { 3000 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 3001 int64_t Offset0, Offset1; 3002 unsigned Dummy0, Dummy1; 3003 bool Offset0IsScalable, Offset1IsScalable; 3004 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, 3005 Dummy0, &RI) || 3006 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, 3007 Dummy1, &RI)) 3008 return false; 3009 3010 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 3011 return false; 3012 3013 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 3014 // FIXME: Handle ds_read2 / ds_write2. 3015 return false; 3016 } 3017 unsigned Width0 = MIa.memoperands().front()->getSize(); 3018 unsigned Width1 = MIb.memoperands().front()->getSize(); 3019 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 3020 } 3021 3022 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 3023 const MachineInstr &MIb) const { 3024 assert(MIa.mayLoadOrStore() && 3025 "MIa must load from or modify a memory location"); 3026 assert(MIb.mayLoadOrStore() && 3027 "MIb must load from or modify a memory location"); 3028 3029 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 3030 return false; 3031 3032 // XXX - Can we relax this between address spaces? 3033 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 3034 return false; 3035 3036 // TODO: Should we check the address space from the MachineMemOperand? That 3037 // would allow us to distinguish objects we know don't alias based on the 3038 // underlying address space, even if it was lowered to a different one, 3039 // e.g. private accesses lowered to use MUBUF instructions on a scratch 3040 // buffer. 
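  // Accesses of the same kind are compared by base operands and offset;
  // across kinds the answer depends on which address spaces the two kinds can
  // touch, with FLAT conservatively treated as potentially aliasing anything.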
3041 if (isDS(MIa)) { 3042 if (isDS(MIb)) 3043 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3044 3045 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 3046 } 3047 3048 if (isMUBUF(MIa) || isMTBUF(MIa)) { 3049 if (isMUBUF(MIb) || isMTBUF(MIb)) 3050 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3051 3052 return !isFLAT(MIb) && !isSMRD(MIb); 3053 } 3054 3055 if (isSMRD(MIa)) { 3056 if (isSMRD(MIb)) 3057 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3058 3059 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 3060 } 3061 3062 if (isFLAT(MIa)) { 3063 if (isFLAT(MIb)) 3064 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3065 3066 return false; 3067 } 3068 3069 return false; 3070 } 3071 3072 static int64_t getFoldableImm(const MachineOperand* MO) { 3073 if (!MO->isReg()) 3074 return false; 3075 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 3076 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3077 auto Def = MRI.getUniqueVRegDef(MO->getReg()); 3078 if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && 3079 Def->getOperand(1).isImm()) 3080 return Def->getOperand(1).getImm(); 3081 return AMDGPU::NoRegister; 3082 } 3083 3084 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, 3085 MachineInstr &NewMI) { 3086 if (LV) { 3087 unsigned NumOps = MI.getNumOperands(); 3088 for (unsigned I = 1; I < NumOps; ++I) { 3089 MachineOperand &Op = MI.getOperand(I); 3090 if (Op.isReg() && Op.isKill()) 3091 LV->replaceKillInstruction(Op.getReg(), MI, NewMI); 3092 } 3093 } 3094 } 3095 3096 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, 3097 MachineInstr &MI, 3098 LiveVariables *LV) const { 3099 unsigned Opc = MI.getOpcode(); 3100 bool IsF16 = false; 3101 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 3102 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3103 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3104 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3105 3106 switch (Opc) { 3107 default: 3108 return nullptr; 3109 case AMDGPU::V_MAC_F16_e64: 3110 case AMDGPU::V_FMAC_F16_e64: 3111 IsF16 = true; 3112 LLVM_FALLTHROUGH; 3113 case AMDGPU::V_MAC_F32_e64: 3114 case AMDGPU::V_FMAC_F32_e64: 3115 case AMDGPU::V_FMAC_F64_e64: 3116 break; 3117 case AMDGPU::V_MAC_F16_e32: 3118 case AMDGPU::V_FMAC_F16_e32: 3119 IsF16 = true; 3120 LLVM_FALLTHROUGH; 3121 case AMDGPU::V_MAC_F32_e32: 3122 case AMDGPU::V_FMAC_F32_e32: 3123 case AMDGPU::V_FMAC_F64_e32: { 3124 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3125 AMDGPU::OpName::src0); 3126 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 3127 if (!Src0->isReg() && !Src0->isImm()) 3128 return nullptr; 3129 3130 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 3131 return nullptr; 3132 3133 break; 3134 } 3135 } 3136 3137 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3138 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 3139 const MachineOperand *Src0Mods = 3140 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 3141 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3142 const MachineOperand *Src1Mods = 3143 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 3144 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3145 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3146 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 3147 
MachineInstrBuilder MIB; 3148 3149 if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && 3150 // If we have an SGPR input, we will violate the constant bus restriction. 3151 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || 3152 !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { 3153 if (auto Imm = getFoldableImm(Src2)) { 3154 unsigned NewOpc = 3155 IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) 3156 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 3157 if (pseudoToMCOpcode(NewOpc) != -1) { 3158 MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3159 .add(*Dst) 3160 .add(*Src0) 3161 .add(*Src1) 3162 .addImm(Imm); 3163 updateLiveVariables(LV, MI, *MIB); 3164 return MIB; 3165 } 3166 } 3167 unsigned NewOpc = IsFMA 3168 ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) 3169 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 3170 if (auto Imm = getFoldableImm(Src1)) { 3171 if (pseudoToMCOpcode(NewOpc) != -1) { 3172 MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3173 .add(*Dst) 3174 .add(*Src0) 3175 .addImm(Imm) 3176 .add(*Src2); 3177 updateLiveVariables(LV, MI, *MIB); 3178 return MIB; 3179 } 3180 } 3181 if (auto Imm = getFoldableImm(Src0)) { 3182 if (pseudoToMCOpcode(NewOpc) != -1 && 3183 isOperandLegal( 3184 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), 3185 Src1)) { 3186 MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3187 .add(*Dst) 3188 .add(*Src1) 3189 .addImm(Imm) 3190 .add(*Src2); 3191 updateLiveVariables(LV, MI, *MIB); 3192 return MIB; 3193 } 3194 } 3195 } 3196 3197 unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 3198 : IsF64 ? AMDGPU::V_FMA_F64_e64 3199 : AMDGPU::V_FMA_F32_e64) 3200 : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); 3201 if (pseudoToMCOpcode(NewOpc) == -1) 3202 return nullptr; 3203 3204 MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3205 .add(*Dst) 3206 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 3207 .add(*Src0) 3208 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 3209 .add(*Src1) 3210 .addImm(0) // Src mods 3211 .add(*Src2) 3212 .addImm(Clamp ? Clamp->getImm() : 0) 3213 .addImm(Omod ? Omod->getImm() : 0); 3214 updateLiveVariables(LV, MI, *MIB); 3215 return MIB; 3216 } 3217 3218 // It's not generally safe to move VALU instructions across these since it will 3219 // start using the register as a base index rather than directly. 3220 // XXX - Why isn't hasSideEffects sufficient for these? 3221 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 3222 switch (MI.getOpcode()) { 3223 case AMDGPU::S_SET_GPR_IDX_ON: 3224 case AMDGPU::S_SET_GPR_IDX_MODE: 3225 case AMDGPU::S_SET_GPR_IDX_OFF: 3226 return true; 3227 default: 3228 return false; 3229 } 3230 } 3231 3232 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 3233 const MachineBasicBlock *MBB, 3234 const MachineFunction &MF) const { 3235 // Skipping the check for SP writes in the base implementation. The reason it 3236 // was added was apparently due to compile time concerns. 3237 // 3238 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops 3239 // but is probably avoidable. 3240 3241 // Copied from base implementation. 3242 // Terminators and labels can't be scheduled around. 
3243 if (MI.isTerminator() || MI.isPosition()) 3244 return true; 3245 3246 // INLINEASM_BR can jump to another block 3247 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) 3248 return true; 3249 3250 // Target-independent instructions do not have an implicit-use of EXEC, even 3251 // when they operate on VGPRs. Treating EXEC modifications as scheduling 3252 // boundaries prevents incorrect movements of such instructions. 3253 return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 3254 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 3255 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 3256 changesVGPRIndexingMode(MI); 3257 } 3258 3259 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 3260 return Opcode == AMDGPU::DS_ORDERED_COUNT || 3261 Opcode == AMDGPU::DS_GWS_INIT || 3262 Opcode == AMDGPU::DS_GWS_SEMA_V || 3263 Opcode == AMDGPU::DS_GWS_SEMA_BR || 3264 Opcode == AMDGPU::DS_GWS_SEMA_P || 3265 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 3266 Opcode == AMDGPU::DS_GWS_BARRIER; 3267 } 3268 3269 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { 3270 // Skip the full operand and register alias search modifiesRegister 3271 // does. There's only a handful of instructions that touch this, it's only an 3272 // implicit def, and doesn't alias any other registers. 3273 if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { 3274 for (; ImpDef && *ImpDef; ++ImpDef) { 3275 if (*ImpDef == AMDGPU::MODE) 3276 return true; 3277 } 3278 } 3279 3280 return false; 3281 } 3282 3283 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 3284 unsigned Opcode = MI.getOpcode(); 3285 3286 if (MI.mayStore() && isSMRD(MI)) 3287 return true; // scalar store or atomic 3288 3289 // This will terminate the function when other lanes may need to continue. 3290 if (MI.isReturn()) 3291 return true; 3292 3293 // These instructions cause shader I/O that may cause hardware lockups 3294 // when executed with an empty EXEC mask. 3295 // 3296 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 3297 // EXEC = 0, but checking for that case here seems not worth it 3298 // given the typical code patterns. 3299 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 3300 isEXP(Opcode) || 3301 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 3302 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 3303 return true; 3304 3305 if (MI.isCall() || MI.isInlineAsm()) 3306 return true; // conservative assumption 3307 3308 // A mode change is a scalar operation that influences vector instructions. 3309 if (modifiesModeRegister(MI)) 3310 return true; 3311 3312 // These are like SALU instructions in terms of effects, so it's questionable 3313 // whether we should return true for those. 3314 // 3315 // However, executing them with EXEC = 0 causes them to operate on undefined 3316 // data, which we avoid by returning true here. 3317 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || 3318 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) 3319 return true; 3320 3321 return false; 3322 } 3323 3324 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 3325 const MachineInstr &MI) const { 3326 if (MI.isMetaInstruction()) 3327 return false; 3328 3329 // This won't read exec if this is an SGPR->SGPR copy. 
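  // A copy into a VGPR is a VALU mov and implicitly reads exec; an
  // SGPR-to-SGPR copy becomes an SALU mov and only reads exec if it copies
  // exec explicitly.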
3330 if (MI.isCopyLike()) { 3331 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 3332 return true; 3333 3334 // Make sure this isn't copying exec as a normal operand 3335 return MI.readsRegister(AMDGPU::EXEC, &RI); 3336 } 3337 3338 // Make a conservative assumption about the callee. 3339 if (MI.isCall()) 3340 return true; 3341 3342 // Be conservative with any unhandled generic opcodes. 3343 if (!isTargetSpecificOpcode(MI.getOpcode())) 3344 return true; 3345 3346 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 3347 } 3348 3349 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 3350 switch (Imm.getBitWidth()) { 3351 case 1: // This likely will be a condition code mask. 3352 return true; 3353 3354 case 32: 3355 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 3356 ST.hasInv2PiInlineImm()); 3357 case 64: 3358 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 3359 ST.hasInv2PiInlineImm()); 3360 case 16: 3361 return ST.has16BitInsts() && 3362 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 3363 ST.hasInv2PiInlineImm()); 3364 default: 3365 llvm_unreachable("invalid bitwidth"); 3366 } 3367 } 3368 3369 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 3370 uint8_t OperandType) const { 3371 if (!MO.isImm() || 3372 OperandType < AMDGPU::OPERAND_SRC_FIRST || 3373 OperandType > AMDGPU::OPERAND_SRC_LAST) 3374 return false; 3375 3376 // MachineOperand provides no way to tell the true operand size, since it only 3377 // records a 64-bit value. We need to know the size to determine if a 32-bit 3378 // floating point immediate bit pattern is legal for an integer immediate. It 3379 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 3380 3381 int64_t Imm = MO.getImm(); 3382 switch (OperandType) { 3383 case AMDGPU::OPERAND_REG_IMM_INT32: 3384 case AMDGPU::OPERAND_REG_IMM_FP32: 3385 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3386 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3387 case AMDGPU::OPERAND_REG_IMM_V2FP32: 3388 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: 3389 case AMDGPU::OPERAND_REG_IMM_V2INT32: 3390 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: 3391 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3392 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { 3393 int32_t Trunc = static_cast<int32_t>(Imm); 3394 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); 3395 } 3396 case AMDGPU::OPERAND_REG_IMM_INT64: 3397 case AMDGPU::OPERAND_REG_IMM_FP64: 3398 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3399 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3400 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: 3401 return AMDGPU::isInlinableLiteral64(MO.getImm(), 3402 ST.hasInv2PiInlineImm()); 3403 case AMDGPU::OPERAND_REG_IMM_INT16: 3404 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3405 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3406 // We would expect inline immediates to not be concerned with an integer/fp 3407 // distinction. However, in the case of 16-bit integer operations, the 3408 // "floating point" values appear to not work. It seems read the low 16-bits 3409 // of 32-bit immediates, which happens to always work for the integer 3410 // values. 3411 // 3412 // See llvm bugzilla 46302. 3413 // 3414 // TODO: Theoretically we could use op-sel to use the high bits of the 3415 // 32-bit FP values. 3416 return AMDGPU::isInlinableIntLiteral(Imm); 3417 case AMDGPU::OPERAND_REG_IMM_V2INT16: 3418 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: 3419 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: 3420 // This suffers the same problem as the scalar 16-bit cases. 
3421 return AMDGPU::isInlinableIntLiteralV216(Imm); 3422 case AMDGPU::OPERAND_REG_IMM_FP16: 3423 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3424 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { 3425 if (isInt<16>(Imm) || isUInt<16>(Imm)) { 3426 // A few special case instructions have 16-bit operands on subtargets 3427 // where 16-bit instructions are not legal. 3428 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle 3429 // constants in these cases 3430 int16_t Trunc = static_cast<int16_t>(Imm); 3431 return ST.has16BitInsts() && 3432 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3433 } 3434 3435 return false; 3436 } 3437 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3438 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3439 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3440 uint32_t Trunc = static_cast<uint32_t>(Imm); 3441 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3442 } 3443 default: 3444 llvm_unreachable("invalid bitwidth"); 3445 } 3446 } 3447 3448 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, 3449 const MCOperandInfo &OpInfo) const { 3450 switch (MO.getType()) { 3451 case MachineOperand::MO_Register: 3452 return false; 3453 case MachineOperand::MO_Immediate: 3454 return !isInlineConstant(MO, OpInfo); 3455 case MachineOperand::MO_FrameIndex: 3456 case MachineOperand::MO_MachineBasicBlock: 3457 case MachineOperand::MO_ExternalSymbol: 3458 case MachineOperand::MO_GlobalAddress: 3459 case MachineOperand::MO_MCSymbol: 3460 return true; 3461 default: 3462 llvm_unreachable("unexpected operand type"); 3463 } 3464 } 3465 3466 static bool compareMachineOp(const MachineOperand &Op0, 3467 const MachineOperand &Op1) { 3468 if (Op0.getType() != Op1.getType()) 3469 return false; 3470 3471 switch (Op0.getType()) { 3472 case MachineOperand::MO_Register: 3473 return Op0.getReg() == Op1.getReg(); 3474 case MachineOperand::MO_Immediate: 3475 return Op0.getImm() == Op1.getImm(); 3476 default: 3477 llvm_unreachable("Didn't expect to be comparing these operand types"); 3478 } 3479 } 3480 3481 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3482 const MachineOperand &MO) const { 3483 const MCInstrDesc &InstDesc = MI.getDesc(); 3484 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; 3485 3486 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3487 3488 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3489 return true; 3490 3491 if (OpInfo.RegClass < 0) 3492 return false; 3493 3494 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3495 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3496 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3497 AMDGPU::OpName::src2)) 3498 return false; 3499 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3500 } 3501 3502 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3503 return false; 3504 3505 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3506 return true; 3507 3508 return ST.hasVOP3Literal(); 3509 } 3510 3511 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3512 // GFX90A does not have V_MUL_LEGACY_F32_e32. 3513 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) 3514 return false; 3515 3516 int Op32 = AMDGPU::getVOPe32(Opcode); 3517 if (Op32 == -1) 3518 return false; 3519 3520 return pseudoToMCOpcode(Op32) != -1; 3521 } 3522 3523 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3524 // The src0_modifier operand is present on all instructions 3525 // that have modifiers. 
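  // (For example, VOP3-encoded opcodes advertise src0_modifiers while their
  // VOP1/VOP2 counterparts do not, so a named-operand lookup is sufficient.)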
3526 3527 return AMDGPU::getNamedOperandIdx(Opcode, 3528 AMDGPU::OpName::src0_modifiers) != -1; 3529 } 3530 3531 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 3532 unsigned OpName) const { 3533 const MachineOperand *Mods = getNamedOperand(MI, OpName); 3534 return Mods && Mods->getImm(); 3535 } 3536 3537 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 3538 return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || 3539 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || 3540 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) || 3541 hasModifiersSet(MI, AMDGPU::OpName::clamp) || 3542 hasModifiersSet(MI, AMDGPU::OpName::omod); 3543 } 3544 3545 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3546 const MachineRegisterInfo &MRI) const { 3547 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3548 // Can't shrink instruction with three operands. 3549 // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add 3550 // a special case for it. It can only be shrunk if the third operand 3551 // is vcc, and src0_modifiers and src1_modifiers are not set. 3552 // We should handle this the same way we handle vopc, by addding 3553 // a register allocation hint pre-regalloc and then do the shrinking 3554 // post-regalloc. 3555 if (Src2) { 3556 switch (MI.getOpcode()) { 3557 default: return false; 3558 3559 case AMDGPU::V_ADDC_U32_e64: 3560 case AMDGPU::V_SUBB_U32_e64: 3561 case AMDGPU::V_SUBBREV_U32_e64: { 3562 const MachineOperand *Src1 3563 = getNamedOperand(MI, AMDGPU::OpName::src1); 3564 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 3565 return false; 3566 // Additional verification is needed for sdst/src2. 3567 return true; 3568 } 3569 case AMDGPU::V_MAC_F32_e64: 3570 case AMDGPU::V_MAC_F16_e64: 3571 case AMDGPU::V_FMAC_F32_e64: 3572 case AMDGPU::V_FMAC_F16_e64: 3573 case AMDGPU::V_FMAC_F64_e64: 3574 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 3575 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 3576 return false; 3577 break; 3578 3579 case AMDGPU::V_CNDMASK_B32_e64: 3580 break; 3581 } 3582 } 3583 3584 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3585 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 3586 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 3587 return false; 3588 3589 // We don't need to check src0, all input types are legal, so just make sure 3590 // src0 isn't using any modifiers. 3591 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 3592 return false; 3593 3594 // Can it be shrunk to a valid 32 bit opcode? 3595 if (!hasVALU32BitEncoding(MI.getOpcode())) 3596 return false; 3597 3598 // Check output modifiers 3599 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 3600 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 3601 } 3602 3603 // Set VCC operand with all flags from \p Orig, except for setting it as 3604 // implicit. 
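// This is used when shrinking to the e32 form (e.g. V_CNDMASK_B32), where the
// explicit carry/select operand of the e64 form becomes an implicit use of
// vcc or vcc_lo; see buildShrunkInst below.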
3605 static void copyFlagsToImplicitVCC(MachineInstr &MI, 3606 const MachineOperand &Orig) { 3607 3608 for (MachineOperand &Use : MI.implicit_operands()) { 3609 if (Use.isUse() && 3610 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { 3611 Use.setIsUndef(Orig.isUndef()); 3612 Use.setIsKill(Orig.isKill()); 3613 return; 3614 } 3615 } 3616 } 3617 3618 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 3619 unsigned Op32) const { 3620 MachineBasicBlock *MBB = MI.getParent();; 3621 MachineInstrBuilder Inst32 = 3622 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 3623 .setMIFlags(MI.getFlags()); 3624 3625 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 3626 // For VOPC instructions, this is replaced by an implicit def of vcc. 3627 int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); 3628 if (Op32DstIdx != -1) { 3629 // dst 3630 Inst32.add(MI.getOperand(0)); 3631 } else { 3632 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 3633 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 3634 "Unexpected case"); 3635 } 3636 3637 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 3638 3639 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3640 if (Src1) 3641 Inst32.add(*Src1); 3642 3643 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3644 3645 if (Src2) { 3646 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 3647 if (Op32Src2Idx != -1) { 3648 Inst32.add(*Src2); 3649 } else { 3650 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 3651 // replaced with an implicit read of vcc or vcc_lo. The implicit read 3652 // of vcc was already added during the initial BuildMI, but we 3653 // 1) may need to change vcc to vcc_lo to preserve the original register 3654 // 2) have to preserve the original flags. 3655 fixImplicitOperands(*Inst32); 3656 copyFlagsToImplicitVCC(*Inst32, *Src2); 3657 } 3658 } 3659 3660 return Inst32; 3661 } 3662 3663 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 3664 const MachineOperand &MO, 3665 const MCOperandInfo &OpInfo) const { 3666 // Literal constants use the constant bus. 3667 //if (isLiteralConstantLike(MO, OpInfo)) 3668 // return true; 3669 if (MO.isImm()) 3670 return !isInlineConstant(MO, OpInfo); 3671 3672 if (!MO.isReg()) 3673 return true; // Misc other operands like FrameIndex 3674 3675 if (!MO.isUse()) 3676 return false; 3677 3678 if (MO.getReg().isVirtual()) 3679 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 3680 3681 // Null is free 3682 if (MO.getReg() == AMDGPU::SGPR_NULL) 3683 return false; 3684 3685 // SGPRs use the constant bus 3686 if (MO.isImplicit()) { 3687 return MO.getReg() == AMDGPU::M0 || 3688 MO.getReg() == AMDGPU::VCC || 3689 MO.getReg() == AMDGPU::VCC_LO; 3690 } else { 3691 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 3692 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 3693 } 3694 } 3695 3696 static Register findImplicitSGPRRead(const MachineInstr &MI) { 3697 for (const MachineOperand &MO : MI.implicit_operands()) { 3698 // We only care about reads. 
3699 if (MO.isDef()) 3700 continue; 3701 3702 switch (MO.getReg()) { 3703 case AMDGPU::VCC: 3704 case AMDGPU::VCC_LO: 3705 case AMDGPU::VCC_HI: 3706 case AMDGPU::M0: 3707 case AMDGPU::FLAT_SCR: 3708 return MO.getReg(); 3709 3710 default: 3711 break; 3712 } 3713 } 3714 3715 return AMDGPU::NoRegister; 3716 } 3717 3718 static bool shouldReadExec(const MachineInstr &MI) { 3719 if (SIInstrInfo::isVALU(MI)) { 3720 switch (MI.getOpcode()) { 3721 case AMDGPU::V_READLANE_B32: 3722 case AMDGPU::V_WRITELANE_B32: 3723 return false; 3724 } 3725 3726 return true; 3727 } 3728 3729 if (MI.isPreISelOpcode() || 3730 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 3731 SIInstrInfo::isSALU(MI) || 3732 SIInstrInfo::isSMRD(MI)) 3733 return false; 3734 3735 return true; 3736 } 3737 3738 static bool isSubRegOf(const SIRegisterInfo &TRI, 3739 const MachineOperand &SuperVec, 3740 const MachineOperand &SubReg) { 3741 if (SubReg.getReg().isPhysical()) 3742 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 3743 3744 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 3745 SubReg.getReg() == SuperVec.getReg(); 3746 } 3747 3748 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 3749 StringRef &ErrInfo) const { 3750 uint16_t Opcode = MI.getOpcode(); 3751 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 3752 return true; 3753 3754 const MachineFunction *MF = MI.getParent()->getParent(); 3755 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3756 3757 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 3758 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 3759 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 3760 3761 // Make sure the number of operands is correct. 3762 const MCInstrDesc &Desc = get(Opcode); 3763 if (!Desc.isVariadic() && 3764 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 3765 ErrInfo = "Instruction has wrong number of operands."; 3766 return false; 3767 } 3768 3769 if (MI.isInlineAsm()) { 3770 // Verify register classes for inlineasm constraints. 3771 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 3772 I != E; ++I) { 3773 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 3774 if (!RC) 3775 continue; 3776 3777 const MachineOperand &Op = MI.getOperand(I); 3778 if (!Op.isReg()) 3779 continue; 3780 3781 Register Reg = Op.getReg(); 3782 if (!Reg.isVirtual() && !RC->contains(Reg)) { 3783 ErrInfo = "inlineasm operand has incorrect register class."; 3784 return false; 3785 } 3786 } 3787 3788 return true; 3789 } 3790 3791 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 3792 ErrInfo = "missing memory operand from MIMG instruction."; 3793 return false; 3794 } 3795 3796 // Make sure the register classes are correct. 3797 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 3798 const MachineOperand &MO = MI.getOperand(i); 3799 if (MO.isFPImm()) { 3800 ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " 3801 "all fp values to integers."; 3802 return false; 3803 } 3804 3805 int RegClass = Desc.OpInfo[i].RegClass; 3806 3807 switch (Desc.OpInfo[i].OperandType) { 3808 case MCOI::OPERAND_REGISTER: 3809 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 3810 ErrInfo = "Illegal immediate value for operand."; 3811 return false; 3812 } 3813 break; 3814 case AMDGPU::OPERAND_REG_IMM_INT32: 3815 case AMDGPU::OPERAND_REG_IMM_FP32: 3816 break; 3817 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 3818 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 3819 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 3820 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 3821 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 3822 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 3823 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 3824 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 3825 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 3826 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: 3827 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { 3828 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 3829 ErrInfo = "Illegal immediate value for operand."; 3830 return false; 3831 } 3832 break; 3833 } 3834 case MCOI::OPERAND_IMMEDIATE: 3835 case AMDGPU::OPERAND_KIMM32: 3836 // Check if this operand is an immediate. 3837 // FrameIndex operands will be replaced by immediates, so they are 3838 // allowed. 3839 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 3840 ErrInfo = "Expected immediate, but got non-immediate"; 3841 return false; 3842 } 3843 LLVM_FALLTHROUGH; 3844 default: 3845 continue; 3846 } 3847 3848 if (!MO.isReg()) 3849 continue; 3850 Register Reg = MO.getReg(); 3851 if (!Reg) 3852 continue; 3853 3854 // FIXME: Ideally we would have separate instruction definitions with the 3855 // aligned register constraint. 3856 // FIXME: We do not verify inline asm operands, but custom inline asm 3857 // verification is broken anyway 3858 if (ST.needsAlignedVGPRs()) { 3859 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); 3860 const bool IsVGPR = RI.hasVGPRs(RC); 3861 const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC); 3862 if ((IsVGPR || IsAGPR) && MO.getSubReg()) { 3863 const TargetRegisterClass *SubRC = 3864 RI.getSubRegClass(RC, MO.getSubReg()); 3865 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); 3866 if (RC) 3867 RC = SubRC; 3868 } 3869 3870 // Check that this is the aligned version of the class. 
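      // (On subtargets that need aligned VGPRs, such as gfx90a, 64-bit and
      // wider VGPR/AGPR tuples must start at an even register; the *_Align2
      // register classes express that requirement.)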
3871 if (!RC || !RI.isProperlyAlignedRC(*RC)) { 3872 ErrInfo = "Subtarget requires even aligned vector registers"; 3873 return false; 3874 } 3875 } 3876 3877 if (RegClass != -1) { 3878 if (Reg.isVirtual()) 3879 continue; 3880 3881 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 3882 if (!RC->contains(Reg)) { 3883 ErrInfo = "Operand has incorrect register class."; 3884 return false; 3885 } 3886 } 3887 } 3888 3889 // Verify SDWA 3890 if (isSDWA(MI)) { 3891 if (!ST.hasSDWA()) { 3892 ErrInfo = "SDWA is not supported on this target"; 3893 return false; 3894 } 3895 3896 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 3897 3898 const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx }; 3899 3900 for (int OpIdx: OpIndicies) { 3901 if (OpIdx == -1) 3902 continue; 3903 const MachineOperand &MO = MI.getOperand(OpIdx); 3904 3905 if (!ST.hasSDWAScalar()) { 3906 // Only VGPRS on VI 3907 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 3908 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 3909 return false; 3910 } 3911 } else { 3912 // No immediates on GFX9 3913 if (!MO.isReg()) { 3914 ErrInfo = 3915 "Only reg allowed as operands in SDWA instructions on GFX9+"; 3916 return false; 3917 } 3918 } 3919 } 3920 3921 if (!ST.hasSDWAOmod()) { 3922 // No omod allowed on VI 3923 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3924 if (OMod != nullptr && 3925 (!OMod->isImm() || OMod->getImm() != 0)) { 3926 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 3927 return false; 3928 } 3929 } 3930 3931 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 3932 if (isVOPC(BasicOpcode)) { 3933 if (!ST.hasSDWASdst() && DstIdx != -1) { 3934 // Only vcc allowed as dst on VI for VOPC 3935 const MachineOperand &Dst = MI.getOperand(DstIdx); 3936 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 3937 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 3938 return false; 3939 } 3940 } else if (!ST.hasSDWAOutModsVOPC()) { 3941 // No clamp allowed on GFX9 for VOPC 3942 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3943 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 3944 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 3945 return false; 3946 } 3947 3948 // No omod allowed on GFX9 for VOPC 3949 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 3950 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 3951 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 3952 return false; 3953 } 3954 } 3955 } 3956 3957 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 3958 if (DstUnused && DstUnused->isImm() && 3959 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 3960 const MachineOperand &Dst = MI.getOperand(DstIdx); 3961 if (!Dst.isReg() || !Dst.isTied()) { 3962 ErrInfo = "Dst register should have tied register"; 3963 return false; 3964 } 3965 3966 const MachineOperand &TiedMO = 3967 MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 3968 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 3969 ErrInfo = 3970 "Dst register should be tied to implicit use of preserved register"; 3971 return false; 3972 } else if (TiedMO.getReg().isPhysical() && 3973 Dst.getReg() != TiedMO.getReg()) { 3974 ErrInfo = "Dst register should use same physical register as preserved"; 3975 return false; 3976 } 3977 } 3978 } 3979 3980 // Verify MIMG 3981 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 
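    // Worked example of the dmask accounting below, assuming packed D16 with
    // TFE set: dmask = 0b0111 gives a popcount of 3, halved to 1 for packed
    // D16, plus 1 for TFE, so 2 result registers are required.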
3982 // Ensure that the return type used is large enough for all the options 3983 // being used TFE/LWE require an extra result register. 3984 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 3985 if (DMask) { 3986 uint64_t DMaskImm = DMask->getImm(); 3987 uint32_t RegCount = 3988 isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm); 3989 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 3990 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 3991 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 3992 3993 // Adjust for packed 16 bit values 3994 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 3995 RegCount >>= 1; 3996 3997 // Adjust if using LWE or TFE 3998 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 3999 RegCount += 1; 4000 4001 const uint32_t DstIdx = 4002 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 4003 const MachineOperand &Dst = MI.getOperand(DstIdx); 4004 if (Dst.isReg()) { 4005 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 4006 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 4007 if (RegCount > DstSize) { 4008 ErrInfo = "MIMG instruction returns too many registers for dst " 4009 "register class"; 4010 return false; 4011 } 4012 } 4013 } 4014 } 4015 4016 // Verify VOP*. Ignore multiple sgpr operands on writelane. 4017 if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 4018 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { 4019 // Only look at the true operands. Only a real operand can use the constant 4020 // bus, and we don't want to check pseudo-operands like the source modifier 4021 // flags. 4022 const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; 4023 4024 unsigned ConstantBusCount = 0; 4025 bool UsesLiteral = false; 4026 const MachineOperand *LiteralVal = nullptr; 4027 4028 if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) 4029 ++ConstantBusCount; 4030 4031 SmallVector<Register, 2> SGPRsUsed; 4032 Register SGPRUsed; 4033 4034 for (int OpIdx : OpIndices) { 4035 if (OpIdx == -1) 4036 break; 4037 const MachineOperand &MO = MI.getOperand(OpIdx); 4038 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 4039 if (MO.isReg()) { 4040 SGPRUsed = MO.getReg(); 4041 if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) { 4042 return SGPRUsed != SGPR; 4043 })) { 4044 ++ConstantBusCount; 4045 SGPRsUsed.push_back(SGPRUsed); 4046 } 4047 } else { 4048 if (!UsesLiteral) { 4049 ++ConstantBusCount; 4050 UsesLiteral = true; 4051 LiteralVal = &MO; 4052 } else if (!MO.isIdenticalTo(*LiteralVal)) { 4053 assert(isVOP3(MI)); 4054 ErrInfo = "VOP3 instruction uses more than one literal"; 4055 return false; 4056 } 4057 } 4058 } 4059 } 4060 4061 SGPRUsed = findImplicitSGPRRead(MI); 4062 if (SGPRUsed != AMDGPU::NoRegister) { 4063 // Implicit uses may safely overlap true overands 4064 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 4065 return !RI.regsOverlap(SGPRUsed, SGPR); 4066 })) { 4067 ++ConstantBusCount; 4068 SGPRsUsed.push_back(SGPRUsed); 4069 } 4070 } 4071 4072 // v_writelane_b32 is an exception from constant bus restriction: 4073 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 4074 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 4075 Opcode != AMDGPU::V_WRITELANE_B32) { 4076 ErrInfo = "VOP* instruction violates constant bus restriction"; 4077 return false; 4078 } 4079 4080 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { 4081 ErrInfo = "VOP3 
instruction uses literal"; 4082 return false; 4083 } 4084 } 4085 4086 // Special case for writelane - this can break the multiple constant bus rule, 4087 // but still can't use more than one SGPR register 4088 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 4089 unsigned SGPRCount = 0; 4090 Register SGPRUsed = AMDGPU::NoRegister; 4091 4092 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { 4093 if (OpIdx == -1) 4094 break; 4095 4096 const MachineOperand &MO = MI.getOperand(OpIdx); 4097 4098 if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { 4099 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 4100 if (MO.getReg() != SGPRUsed) 4101 ++SGPRCount; 4102 SGPRUsed = MO.getReg(); 4103 } 4104 } 4105 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 4106 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 4107 return false; 4108 } 4109 } 4110 } 4111 4112 // Verify misc. restrictions on specific instructions. 4113 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || 4114 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { 4115 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4116 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4117 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 4118 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 4119 if (!compareMachineOp(Src0, Src1) && 4120 !compareMachineOp(Src0, Src2)) { 4121 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 4122 return false; 4123 } 4124 } 4125 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & 4126 SISrcMods::ABS) || 4127 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & 4128 SISrcMods::ABS) || 4129 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & 4130 SISrcMods::ABS)) { 4131 ErrInfo = "ABS not allowed in VOP3B instructions"; 4132 return false; 4133 } 4134 } 4135 4136 if (isSOP2(MI) || isSOPC(MI)) { 4137 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4138 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4139 unsigned Immediates = 0; 4140 4141 if (!Src0.isReg() && 4142 !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType)) 4143 Immediates++; 4144 if (!Src1.isReg() && 4145 !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType)) 4146 Immediates++; 4147 4148 if (Immediates > 1) { 4149 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 4150 return false; 4151 } 4152 } 4153 4154 if (isSOPK(MI)) { 4155 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 4156 if (Desc.isBranch()) { 4157 if (!Op->isMBB()) { 4158 ErrInfo = "invalid branch target for SOPK instruction"; 4159 return false; 4160 } 4161 } else { 4162 uint64_t Imm = Op->getImm(); 4163 if (sopkIsZext(MI)) { 4164 if (!isUInt<16>(Imm)) { 4165 ErrInfo = "invalid immediate for SOPK instruction"; 4166 return false; 4167 } 4168 } else { 4169 if (!isInt<16>(Imm)) { 4170 ErrInfo = "invalid immediate for SOPK instruction"; 4171 return false; 4172 } 4173 } 4174 } 4175 } 4176 4177 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 4178 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 4179 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4180 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 4181 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4182 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 4183 4184 const unsigned StaticNumOps = Desc.getNumOperands() + 4185 Desc.getNumImplicitUses(); 4186 const unsigned NumImplicitOps = IsDst ? 2 : 1; 4187 4188 // Allow additional implicit operands. 
This allows a fixup done by the post 4189 // RA scheduler where the main implicit operand is killed and implicit-defs 4190 // are added for sub-registers that remain live after this instruction. 4191 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 4192 ErrInfo = "missing implicit register operands"; 4193 return false; 4194 } 4195 4196 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4197 if (IsDst) { 4198 if (!Dst->isUse()) { 4199 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 4200 return false; 4201 } 4202 4203 unsigned UseOpIdx; 4204 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 4205 UseOpIdx != StaticNumOps + 1) { 4206 ErrInfo = "movrel implicit operands should be tied"; 4207 return false; 4208 } 4209 } 4210 4211 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4212 const MachineOperand &ImpUse 4213 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 4214 if (!ImpUse.isReg() || !ImpUse.isUse() || 4215 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { 4216 ErrInfo = "src0 should be subreg of implicit vector use"; 4217 return false; 4218 } 4219 } 4220 4221 // Make sure we aren't losing exec uses in the td files. This mostly requires 4222 // being careful when using let Uses to try to add other use registers. 4223 if (shouldReadExec(MI)) { 4224 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 4225 ErrInfo = "VALU instruction does not implicitly read exec mask"; 4226 return false; 4227 } 4228 } 4229 4230 if (isSMRD(MI)) { 4231 if (MI.mayStore()) { 4232 // The register offset form of scalar stores may only use m0 as the 4233 // soffset register. 4234 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); 4235 if (Soff && Soff->getReg() != AMDGPU::M0) { 4236 ErrInfo = "scalar stores must use m0 as offset register"; 4237 return false; 4238 } 4239 } 4240 } 4241 4242 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { 4243 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 4244 if (Offset->getImm() != 0) { 4245 ErrInfo = "subtarget does not support offsets in flat instructions"; 4246 return false; 4247 } 4248 } 4249 4250 if (isMIMG(MI)) { 4251 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 4252 if (DimOp) { 4253 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 4254 AMDGPU::OpName::vaddr0); 4255 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 4256 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 4257 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4258 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 4259 const AMDGPU::MIMGDimInfo *Dim = 4260 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 4261 4262 if (!Dim) { 4263 ErrInfo = "dim is out of range"; 4264 return false; 4265 } 4266 4267 bool IsA16 = false; 4268 if (ST.hasR128A16()) { 4269 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 4270 IsA16 = R128A16->getImm() != 0; 4271 } else if (ST.hasGFX10A16()) { 4272 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 4273 IsA16 = A16->getImm() != 0; 4274 } 4275 4276 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 4277 4278 unsigned AddrWords = 4279 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); 4280 4281 unsigned VAddrWords; 4282 if (IsNSA) { 4283 VAddrWords = SRsrcIdx - VAddr0Idx; 4284 } else { 4285 const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); 4286 VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; 4287 if (AddrWords > 8) 4288 AddrWords = 16; 4289 } 4290 
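      // Non-NSA forms pack the whole address into a single vaddr register
      // tuple, and no intermediate tuple size is used here between 8 and 16
      // dwords, so an expected address count above 8 is rounded up to 16 to
      // match the operand's register class.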
4291 if (VAddrWords != AddrWords) { 4292 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords 4293 << " but got " << VAddrWords << "\n"); 4294 ErrInfo = "bad vaddr size"; 4295 return false; 4296 } 4297 } 4298 } 4299 4300 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 4301 if (DppCt) { 4302 using namespace AMDGPU::DPP; 4303 4304 unsigned DC = DppCt->getImm(); 4305 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 4306 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 4307 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 4308 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 4309 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 4310 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 4311 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 4312 ErrInfo = "Invalid dpp_ctrl value"; 4313 return false; 4314 } 4315 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && 4316 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4317 ErrInfo = "Invalid dpp_ctrl value: " 4318 "wavefront shifts are not supported on GFX10+"; 4319 return false; 4320 } 4321 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && 4322 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4323 ErrInfo = "Invalid dpp_ctrl value: " 4324 "broadcasts are not supported on GFX10+"; 4325 return false; 4326 } 4327 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && 4328 ST.getGeneration() < AMDGPUSubtarget::GFX10) { 4329 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && 4330 DC <= DppCtrl::ROW_NEWBCAST_LAST && 4331 !ST.hasGFX90AInsts()) { 4332 ErrInfo = "Invalid dpp_ctrl value: " 4333 "row_newbroadcast/row_share is not supported before " 4334 "GFX90A/GFX10"; 4335 return false; 4336 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { 4337 ErrInfo = "Invalid dpp_ctrl value: " 4338 "row_share and row_xmask are not supported before GFX10"; 4339 return false; 4340 } 4341 } 4342 4343 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 4344 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 4345 4346 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && 4347 ((DstIdx >= 0 && 4348 (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || 4349 Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) || 4350 ((Src0Idx >= 0 && 4351 (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || 4352 Desc.OpInfo[Src0Idx].RegClass == 4353 AMDGPU::VReg_64_Align2RegClassID)))) && 4354 !AMDGPU::isLegal64BitDPPControl(DC)) { 4355 ErrInfo = "Invalid dpp_ctrl value: " 4356 "64 bit dpp only support row_newbcast"; 4357 return false; 4358 } 4359 } 4360 4361 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { 4362 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4363 uint16_t DataNameIdx = isDS(Opcode) ? 
AMDGPU::OpName::data0 4364 : AMDGPU::OpName::vdata; 4365 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); 4366 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); 4367 if (Data && !Data->isReg()) 4368 Data = nullptr; 4369 4370 if (ST.hasGFX90AInsts()) { 4371 if (Dst && Data && 4372 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { 4373 ErrInfo = "Invalid register class: " 4374 "vdata and vdst should be both VGPR or AGPR"; 4375 return false; 4376 } 4377 if (Data && Data2 && 4378 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { 4379 ErrInfo = "Invalid register class: " 4380 "both data operands should be VGPR or AGPR"; 4381 return false; 4382 } 4383 } else { 4384 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || 4385 (Data && RI.isAGPR(MRI, Data->getReg())) || 4386 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { 4387 ErrInfo = "Invalid register class: " 4388 "agpr loads and stores not supported on this GPU"; 4389 return false; 4390 } 4391 } 4392 } 4393 4394 if (ST.needsAlignedVGPRs() && 4395 (MI.getOpcode() == AMDGPU::DS_GWS_INIT || 4396 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || 4397 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) { 4398 const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0); 4399 Register Reg = Op->getReg(); 4400 bool Aligned = true; 4401 if (Reg.isPhysical()) { 4402 Aligned = !(RI.getHWRegIndex(Reg) & 1); 4403 } else { 4404 const TargetRegisterClass &RC = *MRI.getRegClass(Reg); 4405 Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && 4406 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); 4407 } 4408 4409 if (!Aligned) { 4410 ErrInfo = "Subtarget requires even aligned vector registers " 4411 "for DS_GWS instructions"; 4412 return false; 4413 } 4414 } 4415 4416 return true; 4417 } 4418 4419 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 4420 switch (MI.getOpcode()) { 4421 default: return AMDGPU::INSTRUCTION_LIST_END; 4422 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 4423 case AMDGPU::COPY: return AMDGPU::COPY; 4424 case AMDGPU::PHI: return AMDGPU::PHI; 4425 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 4426 case AMDGPU::WQM: return AMDGPU::WQM; 4427 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; 4428 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; 4429 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; 4430 case AMDGPU::S_MOV_B32: { 4431 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4432 return MI.getOperand(1).isReg() || 4433 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 4434 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 4435 } 4436 case AMDGPU::S_ADD_I32: 4437 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; 4438 case AMDGPU::S_ADDC_U32: 4439 return AMDGPU::V_ADDC_U32_e32; 4440 case AMDGPU::S_SUB_I32: 4441 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; 4442 // FIXME: These are not consistently handled, and selected when the carry is 4443 // used. 
4444 case AMDGPU::S_ADD_U32: 4445 return AMDGPU::V_ADD_CO_U32_e32; 4446 case AMDGPU::S_SUB_U32: 4447 return AMDGPU::V_SUB_CO_U32_e32; 4448 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 4449 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; 4450 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; 4451 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; 4452 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 4453 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 4454 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 4455 case AMDGPU::S_XNOR_B32: 4456 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 4457 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 4458 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 4459 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 4460 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 4461 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 4462 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; 4463 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 4464 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; 4465 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 4466 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; 4467 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; 4468 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; 4469 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; 4470 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; 4471 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 4472 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 4473 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 4474 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 4475 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; 4476 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; 4477 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; 4478 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; 4479 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; 4480 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; 4481 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; 4482 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; 4483 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; 4484 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; 4485 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; 4486 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; 4487 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; 4488 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; 4489 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 4490 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 4491 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 4492 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 4493 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 4494 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 4495 } 4496 llvm_unreachable( 4497 "Unexpected scalar opcode without corresponding vector one!"); 4498 } 4499 4500 static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, 4501 const MachineRegisterInfo &MRI, 4502 const MCInstrDesc &TID, 4503 unsigned RCID, 4504 bool IsAllocatable) { 4505 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 4506 (TID.mayLoad() || TID.mayStore() || 4507 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { 4508 
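    // Narrow the combined AV_* (VGPR-or-AGPR) operand classes down to the
    // plain VGPR classes when the conditions above apply, so the operand is
    // allocated to VGPRs.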
switch (RCID) {
4509     case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
4510     case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
4511     case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
4512     case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
4513     case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
4514     default:
4515       break;
4516     }
4517   }
4518   return RCID;
4519 }
4520
4521 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
4522     unsigned OpNum, const TargetRegisterInfo *TRI,
4523     const MachineFunction &MF)
4524     const {
4525   if (OpNum >= TID.getNumOperands())
4526     return nullptr;
4527   auto RegClass = TID.OpInfo[OpNum].RegClass;
4528   bool IsAllocatable = false;
4529   if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
4530     // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
4531     // with two data operands. Request a register class constrained to VGPR only
4532     // if both operands are present, as Machine Copy Propagation cannot check
4533     // this constraint (and possibly other passes cannot either).
4534     //
4535     // The check is limited to FLAT and DS because atomics in non-flat encoding
4536     // have their vdst and vdata tied to be the same register.
4537     const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4538                                                    AMDGPU::OpName::vdst);
4539     const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4540         (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
4541                                          : AMDGPU::OpName::vdata);
4542     if (DataIdx != -1) {
4543       IsAllocatable = VDstIdx != -1 ||
4544                       AMDGPU::getNamedOperandIdx(TID.Opcode,
4545                                                  AMDGPU::OpName::data1) != -1;
4546     }
4547   }
4548   RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
4549                                        IsAllocatable);
4550   return RI.getRegClass(RegClass);
4551 }
4552
4553 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
4554                                                       unsigned OpNo) const {
4555   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4556   const MCInstrDesc &Desc = get(MI.getOpcode());
4557   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
4558       Desc.OpInfo[OpNo].RegClass == -1) {
4559     Register Reg = MI.getOperand(OpNo).getReg();
4560
4561     if (Reg.isVirtual())
4562       return MRI.getRegClass(Reg);
4563     return RI.getPhysRegClass(Reg);
4564   }
4565
4566   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
4567   RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
4568   return RI.getRegClass(RCID);
4569 }
4570
4571 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
4572   MachineBasicBlock::iterator I = MI;
4573   MachineBasicBlock *MBB = MI.getParent();
4574   MachineOperand &MO = MI.getOperand(OpIdx);
4575   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4576   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
4577   const TargetRegisterClass *RC = RI.getRegClass(RCID);
4578   unsigned Size = RI.getRegSizeInBits(*RC);
4579   unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
4580   if (MO.isReg())
4581     Opcode = AMDGPU::COPY;
4582   else if (RI.isSGPRClass(RC))
4583     Opcode = (Size == 64) ?
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 4584 4585 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 4586 const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); 4587 if (RI.getCommonSubClass(VRC64, VRC)) 4588 VRC = VRC64; 4589 else 4590 VRC = &AMDGPU::VGPR_32RegClass; 4591 4592 Register Reg = MRI.createVirtualRegister(VRC); 4593 DebugLoc DL = MBB->findDebugLoc(I); 4594 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 4595 MO.ChangeToRegister(Reg, false); 4596 } 4597 4598 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 4599 MachineRegisterInfo &MRI, 4600 MachineOperand &SuperReg, 4601 const TargetRegisterClass *SuperRC, 4602 unsigned SubIdx, 4603 const TargetRegisterClass *SubRC) 4604 const { 4605 MachineBasicBlock *MBB = MI->getParent(); 4606 DebugLoc DL = MI->getDebugLoc(); 4607 Register SubReg = MRI.createVirtualRegister(SubRC); 4608 4609 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 4610 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4611 .addReg(SuperReg.getReg(), 0, SubIdx); 4612 return SubReg; 4613 } 4614 4615 // Just in case the super register is itself a sub-register, copy it to a new 4616 // value so we don't need to worry about merging its subreg index with the 4617 // SubIdx passed to this function. The register coalescer should be able to 4618 // eliminate this extra copy. 4619 Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 4620 4621 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 4622 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 4623 4624 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 4625 .addReg(NewSuperReg, 0, SubIdx); 4626 4627 return SubReg; 4628 } 4629 4630 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 4631 MachineBasicBlock::iterator MII, 4632 MachineRegisterInfo &MRI, 4633 MachineOperand &Op, 4634 const TargetRegisterClass *SuperRC, 4635 unsigned SubIdx, 4636 const TargetRegisterClass *SubRC) const { 4637 if (Op.isImm()) { 4638 if (SubIdx == AMDGPU::sub0) 4639 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 4640 if (SubIdx == AMDGPU::sub1) 4641 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 4642 4643 llvm_unreachable("Unhandled register index for immediate"); 4644 } 4645 4646 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 4647 SubIdx, SubRC); 4648 return MachineOperand::CreateReg(SubReg, false); 4649 } 4650 4651 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 4652 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 4653 assert(Inst.getNumExplicitOperands() == 3); 4654 MachineOperand Op1 = Inst.getOperand(1); 4655 Inst.RemoveOperand(1); 4656 Inst.addOperand(Op1); 4657 } 4658 4659 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 4660 const MCOperandInfo &OpInfo, 4661 const MachineOperand &MO) const { 4662 if (!MO.isReg()) 4663 return false; 4664 4665 Register Reg = MO.getReg(); 4666 4667 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 4668 if (Reg.isPhysical()) 4669 return DRC->contains(Reg); 4670 4671 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 4672 4673 if (MO.getSubReg()) { 4674 const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 4675 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 4676 if (!SuperRC) 4677 return false; 4678 4679 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 4680 if (!DRC) 4681 return false; 4682 } 4683 return RC->hasSuperClassEq(DRC); 4684 } 4685 4686 
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 4687 const MCOperandInfo &OpInfo, 4688 const MachineOperand &MO) const { 4689 if (MO.isReg()) 4690 return isLegalRegOperand(MRI, OpInfo, MO); 4691 4692 // Handle non-register types that are treated like immediates. 4693 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 4694 return true; 4695 } 4696 4697 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 4698 const MachineOperand *MO) const { 4699 const MachineFunction &MF = *MI.getParent()->getParent(); 4700 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4701 const MCInstrDesc &InstDesc = MI.getDesc(); 4702 const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; 4703 const TargetRegisterClass *DefinedRC = 4704 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 4705 if (!MO) 4706 MO = &MI.getOperand(OpIdx); 4707 4708 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 4709 int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; 4710 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 4711 if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) 4712 return false; 4713 4714 SmallDenseSet<RegSubRegPair> SGPRsUsed; 4715 if (MO->isReg()) 4716 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); 4717 4718 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 4719 if (i == OpIdx) 4720 continue; 4721 const MachineOperand &Op = MI.getOperand(i); 4722 if (Op.isReg()) { 4723 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); 4724 if (!SGPRsUsed.count(SGPR) && 4725 usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { 4726 if (--ConstantBusLimit <= 0) 4727 return false; 4728 SGPRsUsed.insert(SGPR); 4729 } 4730 } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { 4731 if (--ConstantBusLimit <= 0) 4732 return false; 4733 } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && 4734 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { 4735 if (!VOP3LiteralLimit--) 4736 return false; 4737 if (--ConstantBusLimit <= 0) 4738 return false; 4739 } 4740 } 4741 } 4742 4743 if (MO->isReg()) { 4744 assert(DefinedRC); 4745 if (!isLegalRegOperand(MRI, OpInfo, *MO)) 4746 return false; 4747 bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); 4748 if (IsAGPR && !ST.hasMAIInsts()) 4749 return false; 4750 unsigned Opc = MI.getOpcode(); 4751 if (IsAGPR && 4752 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 4753 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) 4754 return false; 4755 // Atomics should have both vdst and vdata either vgpr or agpr. 4756 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 4757 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, 4758 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); 4759 if ((int)OpIdx == VDstIdx && DataIdx != -1 && 4760 MI.getOperand(DataIdx).isReg() && 4761 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) 4762 return false; 4763 if ((int)OpIdx == DataIdx) { 4764 if (VDstIdx != -1 && 4765 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) 4766 return false; 4767 // DS instructions with 2 src operands also must have tied RC. 
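      // (i.e. data0 and data1 must both be AGPR or both be VGPR.)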
4768 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, 4769 AMDGPU::OpName::data1); 4770 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && 4771 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) 4772 return false; 4773 } 4774 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && 4775 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && 4776 RI.isSGPRReg(MRI, MO->getReg())) 4777 return false; 4778 return true; 4779 } 4780 4781 // Handle non-register types that are treated like immediates. 4782 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); 4783 4784 if (!DefinedRC) { 4785 // This operand expects an immediate. 4786 return true; 4787 } 4788 4789 return isImmOperandLegal(MI, OpIdx, *MO); 4790 } 4791 4792 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 4793 MachineInstr &MI) const { 4794 unsigned Opc = MI.getOpcode(); 4795 const MCInstrDesc &InstrDesc = get(Opc); 4796 4797 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 4798 MachineOperand &Src0 = MI.getOperand(Src0Idx); 4799 4800 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 4801 MachineOperand &Src1 = MI.getOperand(Src1Idx); 4802 4803 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 4804 // we need to only have one constant bus use before GFX10. 4805 bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; 4806 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && 4807 Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) || 4808 isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx]))) 4809 legalizeOpWithMove(MI, Src0Idx); 4810 4811 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 4812 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 4813 // src0/src1 with V_READFIRSTLANE. 4814 if (Opc == AMDGPU::V_WRITELANE_B32) { 4815 const DebugLoc &DL = MI.getDebugLoc(); 4816 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 4817 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4818 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4819 .add(Src0); 4820 Src0.ChangeToRegister(Reg, false); 4821 } 4822 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 4823 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4824 const DebugLoc &DL = MI.getDebugLoc(); 4825 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4826 .add(Src1); 4827 Src1.ChangeToRegister(Reg, false); 4828 } 4829 return; 4830 } 4831 4832 // No VOP2 instructions support AGPRs. 4833 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 4834 legalizeOpWithMove(MI, Src0Idx); 4835 4836 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 4837 legalizeOpWithMove(MI, Src1Idx); 4838 4839 // VOP2 src0 instructions support all operand types, so we don't need to check 4840 // their legality. If src1 is already legal, we don't need to do anything. 4841 if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) 4842 return; 4843 4844 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 4845 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 4846 // select is uniform. 
4847 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 4848 RI.isVGPR(MRI, Src1.getReg())) { 4849 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4850 const DebugLoc &DL = MI.getDebugLoc(); 4851 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4852 .add(Src1); 4853 Src1.ChangeToRegister(Reg, false); 4854 return; 4855 } 4856 4857 // We do not use commuteInstruction here because it is too aggressive and will 4858 // commute if it is possible. We only want to commute here if it improves 4859 // legality. This can be called a fairly large number of times so don't waste 4860 // compile time pointlessly swapping and checking legality again. 4861 if (HasImplicitSGPR || !MI.isCommutable()) { 4862 legalizeOpWithMove(MI, Src1Idx); 4863 return; 4864 } 4865 4866 // If src0 can be used as src1, commuting will make the operands legal. 4867 // Otherwise we have to give up and insert a move. 4868 // 4869 // TODO: Other immediate-like operand kinds could be commuted if there was a 4870 // MachineOperand::ChangeTo* for them. 4871 if ((!Src1.isImm() && !Src1.isReg()) || 4872 !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { 4873 legalizeOpWithMove(MI, Src1Idx); 4874 return; 4875 } 4876 4877 int CommutedOpc = commuteOpcode(MI); 4878 if (CommutedOpc == -1) { 4879 legalizeOpWithMove(MI, Src1Idx); 4880 return; 4881 } 4882 4883 MI.setDesc(get(CommutedOpc)); 4884 4885 Register Src0Reg = Src0.getReg(); 4886 unsigned Src0SubReg = Src0.getSubReg(); 4887 bool Src0Kill = Src0.isKill(); 4888 4889 if (Src1.isImm()) 4890 Src0.ChangeToImmediate(Src1.getImm()); 4891 else if (Src1.isReg()) { 4892 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 4893 Src0.setSubReg(Src1.getSubReg()); 4894 } else 4895 llvm_unreachable("Should only have register or immediate operands"); 4896 4897 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 4898 Src1.setSubReg(Src0SubReg); 4899 fixImplicitOperands(MI); 4900 } 4901 4902 // Legalize VOP3 operands. All operand types are supported for any operand 4903 // but only one literal constant and only starting from GFX10. 4904 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 4905 MachineInstr &MI) const { 4906 unsigned Opc = MI.getOpcode(); 4907 4908 int VOP3Idx[3] = { 4909 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 4910 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 4911 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 4912 }; 4913 4914 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || 4915 Opc == AMDGPU::V_PERMLANEX16_B32_e64) { 4916 // src1 and src2 must be scalar 4917 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); 4918 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); 4919 const DebugLoc &DL = MI.getDebugLoc(); 4920 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { 4921 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4922 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4923 .add(Src1); 4924 Src1.ChangeToRegister(Reg, false); 4925 } 4926 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { 4927 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 4928 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 4929 .add(Src2); 4930 Src2.ChangeToRegister(Reg, false); 4931 } 4932 } 4933 4934 // Find the one SGPR operand we are allowed to use. 4935 int ConstantBusLimit = ST.getConstantBusLimit(Opc); 4936 int LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; 4937 SmallDenseSet<unsigned> SGPRsUsed; 4938 Register SGPRReg = findUsedSGPR(MI, VOP3Idx); 4939 if (SGPRReg != AMDGPU::NoRegister) { 4940 SGPRsUsed.insert(SGPRReg); 4941 --ConstantBusLimit; 4942 } 4943 4944 for (unsigned i = 0; i < 3; ++i) { 4945 int Idx = VOP3Idx[i]; 4946 if (Idx == -1) 4947 break; 4948 MachineOperand &MO = MI.getOperand(Idx); 4949 4950 if (!MO.isReg()) { 4951 if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) 4952 continue; 4953 4954 if (LiteralLimit > 0 && ConstantBusLimit > 0) { 4955 --LiteralLimit; 4956 --ConstantBusLimit; 4957 continue; 4958 } 4959 4960 --LiteralLimit; 4961 --ConstantBusLimit; 4962 legalizeOpWithMove(MI, Idx); 4963 continue; 4964 } 4965 4966 if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) && 4967 !isOperandLegal(MI, Idx, &MO)) { 4968 legalizeOpWithMove(MI, Idx); 4969 continue; 4970 } 4971 4972 if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) 4973 continue; // VGPRs are legal 4974 4975 // We can use one SGPR in each VOP3 instruction prior to GFX10 4976 // and two starting from GFX10. 4977 if (SGPRsUsed.count(MO.getReg())) 4978 continue; 4979 if (ConstantBusLimit > 0) { 4980 SGPRsUsed.insert(MO.getReg()); 4981 --ConstantBusLimit; 4982 continue; 4983 } 4984 4985 // If we make it this far, then the operand is not legal and we must 4986 // legalize it. 4987 legalizeOpWithMove(MI, Idx); 4988 } 4989 } 4990 4991 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 4992 MachineRegisterInfo &MRI) const { 4993 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 4994 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 4995 Register DstReg = MRI.createVirtualRegister(SRC); 4996 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 4997 4998 if (RI.hasAGPRs(VRC)) { 4999 VRC = RI.getEquivalentVGPRClass(VRC); 5000 Register NewSrcReg = MRI.createVirtualRegister(VRC); 5001 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5002 get(TargetOpcode::COPY), NewSrcReg) 5003 .addReg(SrcReg); 5004 SrcReg = NewSrcReg; 5005 } 5006 5007 if (SubRegs == 1) { 5008 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5009 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 5010 .addReg(SrcReg); 5011 return DstReg; 5012 } 5013 5014 SmallVector<unsigned, 8> SRegs; 5015 for (unsigned i = 0; i < SubRegs; ++i) { 5016 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5017 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5018 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 5019 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 5020 SRegs.push_back(SGPR); 5021 } 5022 5023 MachineInstrBuilder MIB = 5024 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5025 get(AMDGPU::REG_SEQUENCE), DstReg); 5026 for (unsigned i = 0; i < SubRegs; ++i) { 5027 MIB.addReg(SRegs[i]); 5028 MIB.addImm(RI.getSubRegFromChannel(i)); 5029 } 5030 return DstReg; 5031 } 5032 5033 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 5034 MachineInstr &MI) const { 5035 5036 // If the pointer is store in VGPRs, then we need to move them to 5037 // SGPRs using v_readfirstlane. This is safe because we only select 5038 // loads with uniform pointers to SMRD instruction so we know the 5039 // pointer value is uniform. 
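  // Roughly (register names are made up for illustration), a 64-bit base
  // pointer that ended up in a VGPR pair is rebuilt as an SGPR pair:
  //   %lo:sgpr_32 = V_READFIRSTLANE_B32 %vbase.sub0
  //   %hi:sgpr_32 = V_READFIRSTLANE_B32 %vbase.sub1
  //   %sbase:sreg_64 = REG_SEQUENCE %lo, sub0, %hi, sub1
  // readlaneVGPRToSGPR below handles any 32*N-bit width the same way.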
5040   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
5041   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
5042     Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
5043     SBase->setReg(SGPR);
5044   }
5045   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
5046   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
5047     Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
5048     SOff->setReg(SGPR);
5049   }
5050 }
5051 
5052 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
5053   unsigned Opc = Inst.getOpcode();
5054   int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
5055   if (OldSAddrIdx < 0)
5056     return false;
5057 
5058   assert(isSegmentSpecificFLAT(Inst));
5059 
5060   int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
5061   if (NewOpc < 0)
5062     NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
5063   if (NewOpc < 0)
5064     return false;
5065 
5066   MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
5067   MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
5068   if (RI.isSGPRReg(MRI, SAddr.getReg()))
5069     return false;
5070 
5071   int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
5072   if (NewVAddrIdx < 0)
5073     return false;
5074 
5075   int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
5076 
5077   // Check vaddr, it shall be zero or absent.
5078   MachineInstr *VAddrDef = nullptr;
5079   if (OldVAddrIdx >= 0) {
5080     MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
5081     VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
5082     if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
5083         !VAddrDef->getOperand(1).isImm() ||
5084         VAddrDef->getOperand(1).getImm() != 0)
5085       return false;
5086   }
5087 
5088   const MCInstrDesc &NewDesc = get(NewOpc);
5089   Inst.setDesc(NewDesc);
5090 
5091   // Callers expect the iterator to be valid after this call, so modify the
5092   // instruction in place.
5093   if (OldVAddrIdx == NewVAddrIdx) {
5094     MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
5095     // Clear use list from the old vaddr holding a zero register.
5096     MRI.removeRegOperandFromUseList(&NewVAddr);
5097     MRI.moveOperands(&NewVAddr, &SAddr, 1);
5098     Inst.RemoveOperand(OldSAddrIdx);
5099     // Update the use list with the pointer we have just moved from vaddr to
5100     // saddr position. Otherwise the new vaddr will be missing from the use list.
5101     MRI.removeRegOperandFromUseList(&NewVAddr);
5102     MRI.addRegOperandToUseList(&NewVAddr);
5103   } else {
5104     assert(OldSAddrIdx == NewVAddrIdx);
5105 
5106     if (OldVAddrIdx >= 0) {
5107       int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
5108                                                  AMDGPU::OpName::vdst_in);
5109 
5110       // RemoveOperand doesn't try to fixup tied operand indexes as it goes, so
5111       // it asserts. Untie the operands for now and retie them afterwards.
5112       if (NewVDstIn != -1) {
5113         int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
5114         Inst.untieRegOperand(OldVDstIn);
5115       }
5116 
5117       Inst.RemoveOperand(OldVAddrIdx);
5118 
5119       if (NewVDstIn != -1) {
5120         int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
5121         Inst.tieOperands(NewVDst, NewVDstIn);
5122       }
5123     }
5124   }
5125 
5126   if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
5127     VAddrDef->eraseFromParent();
5128 
5129   return true;
5130 }
5131 
5132 // FIXME: Remove this when SelectionDAG is obsoleted.
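// A hedged sketch of the saddr fixup done below for segment-specific FLAT
// instructions: when the saddr operand was selected into VGPRs, either the
// instruction is rewritten to its vaddr-only form by moveFlatAddrToVGPR
// above, or each 32-bit half of the address is moved back to SGPRs with
// V_READFIRSTLANE_B32 (see readlaneVGPRToSGPR) and the saddr operand is
// updated to the resulting SGPR register.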
5133 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, 5134 MachineInstr &MI) const { 5135 if (!isSegmentSpecificFLAT(MI)) 5136 return; 5137 5138 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence 5139 // thinks they are uniform, so a readfirstlane should be valid. 5140 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); 5141 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) 5142 return; 5143 5144 if (moveFlatAddrToVGPR(MI)) 5145 return; 5146 5147 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); 5148 SAddr->setReg(ToSGPR); 5149 } 5150 5151 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 5152 MachineBasicBlock::iterator I, 5153 const TargetRegisterClass *DstRC, 5154 MachineOperand &Op, 5155 MachineRegisterInfo &MRI, 5156 const DebugLoc &DL) const { 5157 Register OpReg = Op.getReg(); 5158 unsigned OpSubReg = Op.getSubReg(); 5159 5160 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 5161 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 5162 5163 // Check if operand is already the correct register class. 5164 if (DstRC == OpRC) 5165 return; 5166 5167 Register DstReg = MRI.createVirtualRegister(DstRC); 5168 MachineInstr *Copy = 5169 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 5170 5171 Op.setReg(DstReg); 5172 Op.setSubReg(0); 5173 5174 MachineInstr *Def = MRI.getVRegDef(OpReg); 5175 if (!Def) 5176 return; 5177 5178 // Try to eliminate the copy if it is copying an immediate value. 5179 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) 5180 FoldImmediate(*Copy, *Def, OpReg, &MRI); 5181 5182 bool ImpDef = Def->isImplicitDef(); 5183 while (!ImpDef && Def && Def->isCopy()) { 5184 if (Def->getOperand(1).getReg().isPhysical()) 5185 break; 5186 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); 5187 ImpDef = Def && Def->isImplicitDef(); 5188 } 5189 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && 5190 !ImpDef) 5191 Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); 5192 } 5193 5194 // Emit the actual waterfall loop, executing the wrapped instruction for each 5195 // unique value of \p Rsrc across all lanes. In the best case we execute 1 5196 // iteration, in the worst case we execute 64 (once per lane). 5197 static void 5198 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, 5199 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, 5200 const DebugLoc &DL, MachineOperand &Rsrc) { 5201 MachineFunction &MF = *OrigBB.getParent(); 5202 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5203 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5204 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5205 unsigned SaveExecOpc = 5206 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 5207 unsigned XorTermOpc = 5208 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 5209 unsigned AndOpc = 5210 ST.isWave32() ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 5211 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5212 5213 MachineBasicBlock::iterator I = LoopBB.begin(); 5214 5215 SmallVector<Register, 8> ReadlanePieces; 5216 Register CondReg = AMDGPU::NoRegister; 5217 5218 Register VRsrc = Rsrc.getReg(); 5219 unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); 5220 5221 unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); 5222 unsigned NumSubRegs = RegSize / 32; 5223 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); 5224 5225 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { 5226 5227 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5228 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5229 5230 // Read the next variant <- also loop target. 5231 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) 5232 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); 5233 5234 // Read the next variant <- also loop target. 5235 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) 5236 .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); 5237 5238 ReadlanePieces.push_back(CurRegLo); 5239 ReadlanePieces.push_back(CurRegHi); 5240 5241 // Comparison is to be done as 64-bit. 5242 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 5243 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) 5244 .addReg(CurRegLo) 5245 .addImm(AMDGPU::sub0) 5246 .addReg(CurRegHi) 5247 .addImm(AMDGPU::sub1); 5248 5249 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 5250 auto Cmp = 5251 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) 5252 .addReg(CurReg); 5253 if (NumSubRegs <= 2) 5254 Cmp.addReg(VRsrc); 5255 else 5256 Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); 5257 5258 // Combine the comparision results with AND. 5259 if (CondReg == AMDGPU::NoRegister) // First. 5260 CondReg = NewCondReg; 5261 else { // If not the first, we create an AND. 5262 Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 5263 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 5264 .addReg(CondReg) 5265 .addReg(NewCondReg); 5266 CondReg = AndReg; 5267 } 5268 } // End for loop. 5269 5270 auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); 5271 Register SRsrc = MRI.createVirtualRegister(SRsrcRC); 5272 5273 // Build scalar Rsrc. 5274 auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); 5275 unsigned Channel = 0; 5276 for (Register Piece : ReadlanePieces) { 5277 Merge.addReg(Piece) 5278 .addImm(TRI->getSubRegFromChannel(Channel++)); 5279 } 5280 5281 // Update Rsrc operand to use the SGPR Rsrc. 5282 Rsrc.setReg(SRsrc); 5283 Rsrc.setIsKill(true); 5284 5285 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5286 MRI.setSimpleHint(SaveExec, CondReg); 5287 5288 // Update EXEC to matching lanes, saving original to SaveExec. 5289 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) 5290 .addReg(CondReg, RegState::Kill); 5291 5292 // The original instruction is here; we insert the terminators after it. 5293 I = LoopBB.end(); 5294 5295 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
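  // SaveExec holds the exec mask from loop entry, while EXEC was narrowed to
  // the matching lanes by the S_AND_SAVEEXEC above, so the xor below computes
  // entry_exec & ~CondReg: the lanes just processed drop out, and the loop
  // repeats while any lane is still live.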
5296 BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec) 5297 .addReg(Exec) 5298 .addReg(SaveExec); 5299 5300 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); 5301 } 5302 5303 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register 5304 // with SGPRs by iterating over all unique values across all lanes. 5305 // Returns the loop basic block that now contains \p MI. 5306 static MachineBasicBlock * 5307 loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 5308 MachineOperand &Rsrc, MachineDominatorTree *MDT, 5309 MachineBasicBlock::iterator Begin = nullptr, 5310 MachineBasicBlock::iterator End = nullptr) { 5311 MachineBasicBlock &MBB = *MI.getParent(); 5312 MachineFunction &MF = *MBB.getParent(); 5313 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5314 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5315 MachineRegisterInfo &MRI = MF.getRegInfo(); 5316 if (!Begin.isValid()) 5317 Begin = &MI; 5318 if (!End.isValid()) { 5319 End = &MI; 5320 ++End; 5321 } 5322 const DebugLoc &DL = MI.getDebugLoc(); 5323 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5324 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 5325 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5326 5327 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5328 5329 // Save the EXEC mask 5330 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 5331 5332 // Killed uses in the instruction we are waterfalling around will be 5333 // incorrect due to the added control-flow. 5334 MachineBasicBlock::iterator AfterMI = MI; 5335 ++AfterMI; 5336 for (auto I = Begin; I != AfterMI; I++) { 5337 for (auto &MO : I->uses()) { 5338 if (MO.isReg() && MO.isUse()) { 5339 MRI.clearKillFlags(MO.getReg()); 5340 } 5341 } 5342 } 5343 5344 // To insert the loop we need to split the block. Move everything after this 5345 // point to a new block, and insert a new empty block between the two. 5346 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 5347 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 5348 MachineFunction::iterator MBBI(MBB); 5349 ++MBBI; 5350 5351 MF.insert(MBBI, LoopBB); 5352 MF.insert(MBBI, RemainderBB); 5353 5354 LoopBB->addSuccessor(LoopBB); 5355 LoopBB->addSuccessor(RemainderBB); 5356 5357 // Move Begin to MI to the LoopBB, and the remainder of the block to 5358 // RemainderBB. 5359 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 5360 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); 5361 LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end()); 5362 5363 MBB.addSuccessor(LoopBB); 5364 5365 // Update dominators. We know that MBB immediately dominates LoopBB, that 5366 // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately 5367 // dominates all of the successors transferred to it from MBB that MBB used 5368 // to properly dominate. 
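  // The resulting CFG, roughly:
  //
  //   MBB --> LoopBB --> RemainderBB --> (original successors of MBB)
  //             ^   |
  //             +---+   (LoopBB branches back to itself until EXEC is empty)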
5369 if (MDT) { 5370 MDT->addNewBlock(LoopBB, &MBB); 5371 MDT->addNewBlock(RemainderBB, LoopBB); 5372 for (auto &Succ : RemainderBB->successors()) { 5373 if (MDT->properlyDominates(&MBB, Succ)) { 5374 MDT->changeImmediateDominator(Succ, RemainderBB); 5375 } 5376 } 5377 } 5378 5379 emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); 5380 5381 // Restore the EXEC mask 5382 MachineBasicBlock::iterator First = RemainderBB->begin(); 5383 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 5384 return LoopBB; 5385 } 5386 5387 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 5388 static std::tuple<unsigned, unsigned> 5389 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 5390 MachineBasicBlock &MBB = *MI.getParent(); 5391 MachineFunction &MF = *MBB.getParent(); 5392 MachineRegisterInfo &MRI = MF.getRegInfo(); 5393 5394 // Extract the ptr from the resource descriptor. 5395 unsigned RsrcPtr = 5396 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 5397 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 5398 5399 // Create an empty resource descriptor 5400 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5401 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5402 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5403 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 5404 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 5405 5406 // Zero64 = 0 5407 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 5408 .addImm(0); 5409 5410 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 5411 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 5412 .addImm(RsrcDataFormat & 0xFFFFFFFF); 5413 5414 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 5415 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 5416 .addImm(RsrcDataFormat >> 32); 5417 5418 // NewSRsrc = {Zero64, SRsrcFormat} 5419 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) 5420 .addReg(Zero64) 5421 .addImm(AMDGPU::sub0_sub1) 5422 .addReg(SRsrcFormatLo) 5423 .addImm(AMDGPU::sub2) 5424 .addReg(SRsrcFormatHi) 5425 .addImm(AMDGPU::sub3); 5426 5427 return std::make_tuple(RsrcPtr, NewSRsrc); 5428 } 5429 5430 MachineBasicBlock * 5431 SIInstrInfo::legalizeOperands(MachineInstr &MI, 5432 MachineDominatorTree *MDT) const { 5433 MachineFunction &MF = *MI.getParent()->getParent(); 5434 MachineRegisterInfo &MRI = MF.getRegInfo(); 5435 MachineBasicBlock *CreatedBB = nullptr; 5436 5437 // Legalize VOP2 5438 if (isVOP2(MI) || isVOPC(MI)) { 5439 legalizeOperandsVOP2(MRI, MI); 5440 return CreatedBB; 5441 } 5442 5443 // Legalize VOP3 5444 if (isVOP3(MI)) { 5445 legalizeOperandsVOP3(MRI, MI); 5446 return CreatedBB; 5447 } 5448 5449 // Legalize SMRD 5450 if (isSMRD(MI)) { 5451 legalizeOperandsSMRD(MRI, MI); 5452 return CreatedBB; 5453 } 5454 5455 // Legalize FLAT 5456 if (isFLAT(MI)) { 5457 legalizeOperandsFLAT(MRI, MI); 5458 return CreatedBB; 5459 } 5460 5461 // Legalize REG_SEQUENCE and PHI 5462 // The register class of the operands much be the same type as the register 5463 // class of the output. 
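  // For PHIs this means, e.g. (made-up registers), that
  //   %r:sreg_32 = PHI %a:sgpr_32, %bb.0, %b:vgpr_32, %bb.1
  // gets every input legalized to the common (vector) class chosen below,
  // with the required COPYs inserted before the terminator of the
  // corresponding predecessor block.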
5464   if (MI.getOpcode() == AMDGPU::PHI) {
5465     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
5466     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
5467       if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
5468         continue;
5469       const TargetRegisterClass *OpRC =
5470           MRI.getRegClass(MI.getOperand(i).getReg());
5471       if (RI.hasVectorRegisters(OpRC)) {
5472         VRC = OpRC;
5473       } else {
5474         SRC = OpRC;
5475       }
5476     }
5477 
5478     // If any of the operands are VGPR registers, then they must all be VGPRs;
5479     // otherwise we will create illegal VGPR->SGPR copies when legalizing
5480     // them.
5481     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
5482       if (!VRC) {
5483         assert(SRC);
5484         if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
5485           VRC = &AMDGPU::VReg_1RegClass;
5486         } else
5487           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5488                     ? RI.getEquivalentAGPRClass(SRC)
5489                     : RI.getEquivalentVGPRClass(SRC);
5490       } else {
5491         VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5492                   ? RI.getEquivalentAGPRClass(VRC)
5493                   : RI.getEquivalentVGPRClass(VRC);
5494       }
5495       RC = VRC;
5496     } else {
5497       RC = SRC;
5498     }
5499 
5500     // Update all the operands so they have the same type.
5501     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5502       MachineOperand &Op = MI.getOperand(I);
5503       if (!Op.isReg() || !Op.getReg().isVirtual())
5504         continue;
5505 
5506       // MI is a PHI instruction.
5507       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
5508       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
5509 
5510       // Avoid creating no-op copies with the same src and dst reg class. These
5511       // confuse some of the machine passes.
5512       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
5513     }
5514   }
5515 
5516   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
5517   // VGPR dest type and SGPR sources, insert copies so all operands are
5518   // VGPRs. This seems to help operand folding / the register coalescer.
5519   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
5520     MachineBasicBlock *MBB = MI.getParent();
5521     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
5522     if (RI.hasVGPRs(DstRC)) {
5523       // Update all the operands so they are VGPR register classes. These may
5524       // not be the same register class because REG_SEQUENCE supports mixing
5525       // subregister index types e.g.
sub0_sub1 + sub2 + sub3 5526 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 5527 MachineOperand &Op = MI.getOperand(I); 5528 if (!Op.isReg() || !Op.getReg().isVirtual()) 5529 continue; 5530 5531 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 5532 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 5533 if (VRC == OpRC) 5534 continue; 5535 5536 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 5537 Op.setIsKill(); 5538 } 5539 } 5540 5541 return CreatedBB; 5542 } 5543 5544 // Legalize INSERT_SUBREG 5545 // src0 must have the same register class as dst 5546 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 5547 Register Dst = MI.getOperand(0).getReg(); 5548 Register Src0 = MI.getOperand(1).getReg(); 5549 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 5550 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 5551 if (DstRC != Src0RC) { 5552 MachineBasicBlock *MBB = MI.getParent(); 5553 MachineOperand &Op = MI.getOperand(1); 5554 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 5555 } 5556 return CreatedBB; 5557 } 5558 5559 // Legalize SI_INIT_M0 5560 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 5561 MachineOperand &Src = MI.getOperand(0); 5562 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 5563 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 5564 return CreatedBB; 5565 } 5566 5567 // Legalize MIMG and MUBUF/MTBUF for shaders. 5568 // 5569 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 5570 // scratch memory access. In both cases, the legalization never involves 5571 // conversion to the addr64 form. 5572 if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && 5573 (isMUBUF(MI) || isMTBUF(MI)))) { 5574 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 5575 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) 5576 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); 5577 5578 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 5579 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) 5580 CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); 5581 5582 return CreatedBB; 5583 } 5584 5585 // Legalize SI_CALL 5586 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { 5587 MachineOperand *Dest = &MI.getOperand(0); 5588 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { 5589 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and 5590 // following copies, we also need to move copies from and to physical 5591 // registers into the loop block. 5592 unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); 5593 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); 5594 5595 // Also move the copies to physical registers into the loop block 5596 MachineBasicBlock &MBB = *MI.getParent(); 5597 MachineBasicBlock::iterator Start(&MI); 5598 while (Start->getOpcode() != FrameSetupOpcode) 5599 --Start; 5600 MachineBasicBlock::iterator End(&MI); 5601 while (End->getOpcode() != FrameDestroyOpcode) 5602 ++End; 5603 // Also include following copies of the return value 5604 ++End; 5605 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && 5606 MI.definesRegister(End->getOperand(1).getReg())) 5607 ++End; 5608 CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); 5609 } 5610 } 5611 5612 // Legalize MUBUF* instructions. 
5613   int RsrcIdx =
5614       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
5615   if (RsrcIdx != -1) {
5616     // We have a MUBUF instruction
5617     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
5618     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
5619     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
5620                              RI.getRegClass(RsrcRC))) {
5621       // The operands are legal.
5622       // FIXME: We may need to legalize operands besides srsrc.
5623       return CreatedBB;
5624     }
5625 
5626     // Legalize a VGPR Rsrc.
5627     //
5628     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
5629     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
5630     // a zero-value SRsrc.
5631     //
5632     // If the instruction is _OFFSET (both idxen and offen disabled), and we
5633     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
5634     // above.
5635     //
5636     // Otherwise we are on non-ADDR64 hardware, and/or we have
5637     // idxen/offen/bothen and we fall back to a waterfall loop.
5638 
5639     MachineBasicBlock &MBB = *MI.getParent();
5640 
5641     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5642     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
5643       // This is already an ADDR64 instruction so we need to add the pointer
5644       // extracted from the resource descriptor to the current value of VAddr.
5645       Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5646       Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5647       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5648 
5649       const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5650       Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
5651       Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
5652 
5653       unsigned RsrcPtr, NewSRsrc;
5654       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5655 
5656       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
5657       const DebugLoc &DL = MI.getDebugLoc();
5658       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
5659           .addDef(CondReg0)
5660           .addReg(RsrcPtr, 0, AMDGPU::sub0)
5661           .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
5662           .addImm(0);
5663 
5664       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
5665       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
5666           .addDef(CondReg1, RegState::Dead)
5667           .addReg(RsrcPtr, 0, AMDGPU::sub1)
5668           .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
5669           .addReg(CondReg0, RegState::Kill)
5670           .addImm(0);
5671 
5672       // NewVaddr = {NewVaddrHi, NewVaddrLo}
5673       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
5674           .addReg(NewVAddrLo)
5675           .addImm(AMDGPU::sub0)
5676           .addReg(NewVAddrHi)
5677           .addImm(AMDGPU::sub1);
5678 
5679       VAddr->setReg(NewVAddr);
5680       Rsrc->setReg(NewSRsrc);
5681     } else if (!VAddr && ST.hasAddr64()) {
5682       // This instruction is the _OFFSET variant, so we need to convert it to
5683       // ADDR64.
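      // Roughly (operand lists abbreviated, register names invented):
      //   BUFFER_LOAD_DWORD_OFFSET %vdata, %vrsrc, %soffset, imm
      // becomes
      //   %ptr:vreg_64    = COPY %vrsrc.sub0_sub1              (extractRsrcPtr)
      //   %zrsrc:sgpr_128 = { 0 (64-bit), RSRC_DATA_FORMAT }   (extractRsrcPtr)
      //   BUFFER_LOAD_DWORD_ADDR64 %vdata, %ptr, %zrsrc, %soffset, imm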
5684       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
5685              "FIXME: Need to emit flat atomics here");
5686 
5687       unsigned RsrcPtr, NewSRsrc;
5688       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5689 
5690       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5691       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
5692       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5693       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
5694       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
5695 
5696       // Atomics with return have an additional tied operand and are
5697       // missing some of the special bits.
5698       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
5699       MachineInstr *Addr64;
5700 
5701       if (!VDataIn) {
5702         // Regular buffer load / store.
5703         MachineInstrBuilder MIB =
5704             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5705                 .add(*VData)
5706                 .addReg(NewVAddr)
5707                 .addReg(NewSRsrc)
5708                 .add(*SOffset)
5709                 .add(*Offset);
5710 
5711         if (const MachineOperand *CPol =
5712                 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5713           MIB.addImm(CPol->getImm());
5714         }
5715 
5716         if (const MachineOperand *TFE =
5717                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
5718           MIB.addImm(TFE->getImm());
5719         }
5720 
5721         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
5722 
5723         MIB.cloneMemRefs(MI);
5724         Addr64 = MIB;
5725       } else {
5726         // Atomics with return.
5727         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5728                      .add(*VData)
5729                      .add(*VDataIn)
5730                      .addReg(NewVAddr)
5731                      .addReg(NewSRsrc)
5732                      .add(*SOffset)
5733                      .add(*Offset)
5734                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
5735                      .cloneMemRefs(MI);
5736       }
5737 
5738       MI.removeFromParent();
5739 
5740       // NewVaddr = {RsrcPtr:sub0, RsrcPtr:sub1}
5741       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
5742               NewVAddr)
5743           .addReg(RsrcPtr, 0, AMDGPU::sub0)
5744           .addImm(AMDGPU::sub0)
5745           .addReg(RsrcPtr, 0, AMDGPU::sub1)
5746           .addImm(AMDGPU::sub1);
5747     } else {
5748       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
5749       // to SGPRs.
5750       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
5751       return CreatedBB;
5752     }
5753   }
5754   return CreatedBB;
5755 }
5756 
5757 MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
5758                                            MachineDominatorTree *MDT) const {
5759   SetVectorType Worklist;
5760   Worklist.insert(&TopInst);
5761   MachineBasicBlock *CreatedBB = nullptr;
5762   MachineBasicBlock *CreatedBBTmp = nullptr;
5763 
5764   while (!Worklist.empty()) {
5765     MachineInstr &Inst = *Worklist.pop_back_val();
5766     MachineBasicBlock *MBB = Inst.getParent();
5767     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5768 
5769     unsigned Opcode = Inst.getOpcode();
5770     unsigned NewOpcode = getVALUOp(Inst);
5771 
5772     // Handle some special cases
5773     switch (Opcode) {
5774     default:
5775       break;
5776     case AMDGPU::S_ADD_U64_PSEUDO:
5777     case AMDGPU::S_SUB_U64_PSEUDO:
5778       splitScalar64BitAddSub(Worklist, Inst, MDT);
5779       Inst.eraseFromParent();
5780       continue;
5781     case AMDGPU::S_ADD_I32:
5782     case AMDGPU::S_SUB_I32: {
5783       // FIXME: The u32 versions currently selected use the carry.
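      // moveScalarAddSub only rewrites this to V_ADD_U32_e64 / V_SUB_U32_e64
      // on subtargets with add-no-carry; otherwise Changed stays false and we
      // fall through to the default handling below.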
5784 bool Changed; 5785 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); 5786 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 5787 CreatedBB = CreatedBBTmp; 5788 if (Changed) 5789 continue; 5790 5791 // Default handling 5792 break; 5793 } 5794 case AMDGPU::S_AND_B64: 5795 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 5796 Inst.eraseFromParent(); 5797 continue; 5798 5799 case AMDGPU::S_OR_B64: 5800 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 5801 Inst.eraseFromParent(); 5802 continue; 5803 5804 case AMDGPU::S_XOR_B64: 5805 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 5806 Inst.eraseFromParent(); 5807 continue; 5808 5809 case AMDGPU::S_NAND_B64: 5810 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 5811 Inst.eraseFromParent(); 5812 continue; 5813 5814 case AMDGPU::S_NOR_B64: 5815 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 5816 Inst.eraseFromParent(); 5817 continue; 5818 5819 case AMDGPU::S_XNOR_B64: 5820 if (ST.hasDLInsts()) 5821 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 5822 else 5823 splitScalar64BitXnor(Worklist, Inst, MDT); 5824 Inst.eraseFromParent(); 5825 continue; 5826 5827 case AMDGPU::S_ANDN2_B64: 5828 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 5829 Inst.eraseFromParent(); 5830 continue; 5831 5832 case AMDGPU::S_ORN2_B64: 5833 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 5834 Inst.eraseFromParent(); 5835 continue; 5836 5837 case AMDGPU::S_BREV_B64: 5838 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); 5839 Inst.eraseFromParent(); 5840 continue; 5841 5842 case AMDGPU::S_NOT_B64: 5843 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 5844 Inst.eraseFromParent(); 5845 continue; 5846 5847 case AMDGPU::S_BCNT1_I32_B64: 5848 splitScalar64BitBCNT(Worklist, Inst); 5849 Inst.eraseFromParent(); 5850 continue; 5851 5852 case AMDGPU::S_BFE_I64: 5853 splitScalar64BitBFE(Worklist, Inst); 5854 Inst.eraseFromParent(); 5855 continue; 5856 5857 case AMDGPU::S_LSHL_B32: 5858 if (ST.hasOnlyRevVALUShifts()) { 5859 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 5860 swapOperands(Inst); 5861 } 5862 break; 5863 case AMDGPU::S_ASHR_I32: 5864 if (ST.hasOnlyRevVALUShifts()) { 5865 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 5866 swapOperands(Inst); 5867 } 5868 break; 5869 case AMDGPU::S_LSHR_B32: 5870 if (ST.hasOnlyRevVALUShifts()) { 5871 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 5872 swapOperands(Inst); 5873 } 5874 break; 5875 case AMDGPU::S_LSHL_B64: 5876 if (ST.hasOnlyRevVALUShifts()) { 5877 NewOpcode = AMDGPU::V_LSHLREV_B64_e64; 5878 swapOperands(Inst); 5879 } 5880 break; 5881 case AMDGPU::S_ASHR_I64: 5882 if (ST.hasOnlyRevVALUShifts()) { 5883 NewOpcode = AMDGPU::V_ASHRREV_I64_e64; 5884 swapOperands(Inst); 5885 } 5886 break; 5887 case AMDGPU::S_LSHR_B64: 5888 if (ST.hasOnlyRevVALUShifts()) { 5889 NewOpcode = AMDGPU::V_LSHRREV_B64_e64; 5890 swapOperands(Inst); 5891 } 5892 break; 5893 5894 case AMDGPU::S_ABS_I32: 5895 lowerScalarAbs(Worklist, Inst); 5896 Inst.eraseFromParent(); 5897 continue; 5898 5899 case AMDGPU::S_CBRANCH_SCC0: 5900 case AMDGPU::S_CBRANCH_SCC1: 5901 // Clear unused bits of vcc 5902 if (ST.isWave32()) 5903 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32), 5904 AMDGPU::VCC_LO) 5905 .addReg(AMDGPU::EXEC_LO) 5906 .addReg(AMDGPU::VCC_LO); 5907 else 5908 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), 5909 AMDGPU::VCC) 5910 .addReg(AMDGPU::EXEC) 
5911 .addReg(AMDGPU::VCC); 5912 break; 5913 5914 case AMDGPU::S_BFE_U64: 5915 case AMDGPU::S_BFM_B64: 5916 llvm_unreachable("Moving this op to VALU not implemented"); 5917 5918 case AMDGPU::S_PACK_LL_B32_B16: 5919 case AMDGPU::S_PACK_LH_B32_B16: 5920 case AMDGPU::S_PACK_HH_B32_B16: 5921 movePackToVALU(Worklist, MRI, Inst); 5922 Inst.eraseFromParent(); 5923 continue; 5924 5925 case AMDGPU::S_XNOR_B32: 5926 lowerScalarXnor(Worklist, Inst); 5927 Inst.eraseFromParent(); 5928 continue; 5929 5930 case AMDGPU::S_NAND_B32: 5931 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 5932 Inst.eraseFromParent(); 5933 continue; 5934 5935 case AMDGPU::S_NOR_B32: 5936 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 5937 Inst.eraseFromParent(); 5938 continue; 5939 5940 case AMDGPU::S_ANDN2_B32: 5941 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 5942 Inst.eraseFromParent(); 5943 continue; 5944 5945 case AMDGPU::S_ORN2_B32: 5946 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 5947 Inst.eraseFromParent(); 5948 continue; 5949 5950 // TODO: remove as soon as everything is ready 5951 // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 5952 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 5953 // can only be selected from the uniform SDNode. 5954 case AMDGPU::S_ADD_CO_PSEUDO: 5955 case AMDGPU::S_SUB_CO_PSEUDO: { 5956 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 5957 ? AMDGPU::V_ADDC_U32_e64 5958 : AMDGPU::V_SUBB_U32_e64; 5959 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5960 5961 Register CarryInReg = Inst.getOperand(4).getReg(); 5962 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 5963 Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 5964 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 5965 .addReg(CarryInReg); 5966 } 5967 5968 Register CarryOutReg = Inst.getOperand(1).getReg(); 5969 5970 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 5971 MRI.getRegClass(Inst.getOperand(0).getReg()))); 5972 MachineInstr *CarryOp = 5973 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 5974 .addReg(CarryOutReg, RegState::Define) 5975 .add(Inst.getOperand(2)) 5976 .add(Inst.getOperand(3)) 5977 .addReg(CarryInReg) 5978 .addImm(0); 5979 CreatedBBTmp = legalizeOperands(*CarryOp); 5980 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 5981 CreatedBB = CreatedBBTmp; 5982 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 5983 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 5984 Inst.eraseFromParent(); 5985 } 5986 continue; 5987 case AMDGPU::S_UADDO_PSEUDO: 5988 case AMDGPU::S_USUBO_PSEUDO: { 5989 const DebugLoc &DL = Inst.getDebugLoc(); 5990 MachineOperand &Dest0 = Inst.getOperand(0); 5991 MachineOperand &Dest1 = Inst.getOperand(1); 5992 MachineOperand &Src0 = Inst.getOperand(2); 5993 MachineOperand &Src1 = Inst.getOperand(3); 5994 5995 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 5996 ? 
AMDGPU::V_ADD_CO_U32_e64 5997 : AMDGPU::V_SUB_CO_U32_e64; 5998 const TargetRegisterClass *NewRC = 5999 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 6000 Register DestReg = MRI.createVirtualRegister(NewRC); 6001 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 6002 .addReg(Dest1.getReg(), RegState::Define) 6003 .add(Src0) 6004 .add(Src1) 6005 .addImm(0); // clamp bit 6006 6007 CreatedBBTmp = legalizeOperands(*NewInstr, MDT); 6008 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6009 CreatedBB = CreatedBBTmp; 6010 6011 MRI.replaceRegWith(Dest0.getReg(), DestReg); 6012 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 6013 Worklist); 6014 Inst.eraseFromParent(); 6015 } 6016 continue; 6017 6018 case AMDGPU::S_CSELECT_B32: 6019 case AMDGPU::S_CSELECT_B64: 6020 lowerSelect(Worklist, Inst, MDT); 6021 Inst.eraseFromParent(); 6022 continue; 6023 } 6024 6025 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 6026 // We cannot move this instruction to the VALU, so we should try to 6027 // legalize its operands instead. 6028 CreatedBBTmp = legalizeOperands(Inst, MDT); 6029 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6030 CreatedBB = CreatedBBTmp; 6031 continue; 6032 } 6033 6034 // Use the new VALU Opcode. 6035 const MCInstrDesc &NewDesc = get(NewOpcode); 6036 Inst.setDesc(NewDesc); 6037 6038 // Remove any references to SCC. Vector instructions can't read from it, and 6039 // We're just about to add the implicit use / defs of VCC, and we don't want 6040 // both. 6041 for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { 6042 MachineOperand &Op = Inst.getOperand(i); 6043 if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { 6044 // Only propagate through live-def of SCC. 6045 if (Op.isDef() && !Op.isDead()) 6046 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); 6047 if (Op.isUse()) 6048 addSCCDefsToVALUWorklist(Op, Worklist); 6049 Inst.RemoveOperand(i); 6050 } 6051 } 6052 6053 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 6054 // We are converting these to a BFE, so we need to add the missing 6055 // operands for the size and offset. 6056 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 6057 Inst.addOperand(MachineOperand::CreateImm(0)); 6058 Inst.addOperand(MachineOperand::CreateImm(Size)); 6059 6060 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 6061 // The VALU version adds the second operand to the result, so insert an 6062 // extra 0 operand. 6063 Inst.addOperand(MachineOperand::CreateImm(0)); 6064 } 6065 6066 Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); 6067 fixImplicitOperands(Inst); 6068 6069 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 6070 const MachineOperand &OffsetWidthOp = Inst.getOperand(2); 6071 // If we need to move this to VGPRs, we need to unpack the second operand 6072 // back into the 2 separate ones for bit offset and width. 6073 assert(OffsetWidthOp.isImm() && 6074 "Scalar BFE is only implemented for constant width and offset"); 6075 uint32_t Imm = OffsetWidthOp.getImm(); 6076 6077 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 6078 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 6079 Inst.RemoveOperand(2); // Remove old immediate. 
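    // E.g. a packed immediate of 0x100008 unpacks to offset 8 and width 16
    // for the VALU BFE.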
6080 Inst.addOperand(MachineOperand::CreateImm(Offset)); 6081 Inst.addOperand(MachineOperand::CreateImm(BitWidth)); 6082 } 6083 6084 bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); 6085 unsigned NewDstReg = AMDGPU::NoRegister; 6086 if (HasDst) { 6087 Register DstReg = Inst.getOperand(0).getReg(); 6088 if (DstReg.isPhysical()) 6089 continue; 6090 6091 // Update the destination register class. 6092 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 6093 if (!NewDstRC) 6094 continue; 6095 6096 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && 6097 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 6098 // Instead of creating a copy where src and dst are the same register 6099 // class, we just replace all uses of dst with src. These kinds of 6100 // copies interfere with the heuristics MachineSink uses to decide 6101 // whether or not to split a critical edge. Since the pass assumes 6102 // that copies will end up as machine instructions and not be 6103 // eliminated. 6104 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 6105 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 6106 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 6107 Inst.getOperand(0).setReg(DstReg); 6108 6109 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 6110 // these are deleted later, but at -O0 it would leave a suspicious 6111 // looking illegal copy of an undef register. 6112 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 6113 Inst.RemoveOperand(I); 6114 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 6115 continue; 6116 } 6117 6118 NewDstReg = MRI.createVirtualRegister(NewDstRC); 6119 MRI.replaceRegWith(DstReg, NewDstReg); 6120 } 6121 6122 // Legalize the operands 6123 CreatedBBTmp = legalizeOperands(Inst, MDT); 6124 if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) 6125 CreatedBB = CreatedBBTmp; 6126 6127 if (HasDst) 6128 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 6129 } 6130 return CreatedBB; 6131 } 6132 6133 // Add/sub require special handling to deal with carry outs. 6134 std::pair<bool, MachineBasicBlock *> 6135 SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, 6136 MachineDominatorTree *MDT) const { 6137 if (ST.hasAddNoCarry()) { 6138 // Assume there is no user of scc since we don't select this in that case. 6139 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 6140 // is used. 6141 6142 MachineBasicBlock &MBB = *Inst.getParent(); 6143 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6144 6145 Register OldDstReg = Inst.getOperand(0).getReg(); 6146 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6147 6148 unsigned Opc = Inst.getOpcode(); 6149 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 6150 6151 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 
6152 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 6153 6154 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 6155 Inst.RemoveOperand(3); 6156 6157 Inst.setDesc(get(NewOpc)); 6158 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 6159 Inst.addImplicitDefUseOperands(*MBB.getParent()); 6160 MRI.replaceRegWith(OldDstReg, ResultReg); 6161 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); 6162 6163 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6164 return std::make_pair(true, NewBB); 6165 } 6166 6167 return std::make_pair(false, nullptr); 6168 } 6169 6170 void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, 6171 MachineDominatorTree *MDT) const { 6172 6173 MachineBasicBlock &MBB = *Inst.getParent(); 6174 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6175 MachineBasicBlock::iterator MII = Inst; 6176 DebugLoc DL = Inst.getDebugLoc(); 6177 6178 MachineOperand &Dest = Inst.getOperand(0); 6179 MachineOperand &Src0 = Inst.getOperand(1); 6180 MachineOperand &Src1 = Inst.getOperand(2); 6181 MachineOperand &Cond = Inst.getOperand(3); 6182 6183 Register SCCSource = Cond.getReg(); 6184 // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead. 6185 if (!Cond.isUndef()) { 6186 for (MachineInstr &CandI : 6187 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), 6188 Inst.getParent()->rend())) { 6189 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != 6190 -1) { 6191 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { 6192 SCCSource = CandI.getOperand(1).getReg(); 6193 } 6194 break; 6195 } 6196 } 6197 } 6198 6199 // If this is a trivial select where the condition is effectively not SCC 6200 // (SCCSource is a source of copy to SCC), then the select is semantically 6201 // equivalent to copying SCCSource. Hence, there is no need to create 6202 // V_CNDMASK, we can just use that and bail out. 6203 if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) && 6204 Src1.isImm() && (Src1.getImm() == 0)) { 6205 MRI.replaceRegWith(Dest.getReg(), SCCSource); 6206 return; 6207 } 6208 6209 const TargetRegisterClass *TC = ST.getWavefrontSize() == 64 6210 ? &AMDGPU::SReg_64_XEXECRegClass 6211 : &AMDGPU::SReg_32_XM0_XEXECRegClass; 6212 Register CopySCC = MRI.createVirtualRegister(TC); 6213 6214 if (SCCSource == AMDGPU::SCC) { 6215 // Insert a trivial select instead of creating a copy, because a copy from 6216 // SCC would semantically mean just copying a single bit, but we may need 6217 // the result to be a vector condition mask that needs preserving. 6218 unsigned Opcode = (ST.getWavefrontSize() == 64) ? 
AMDGPU::S_CSELECT_B64
6219                           : AMDGPU::S_CSELECT_B32;
6220     auto NewSelect =
6221         BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
6222     NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
6223   } else {
6224     BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
6225   }
6226 
6227   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6228 
6229   auto UpdatedInst =
6230       BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
6231           .addImm(0)
6232           .add(Src1) // False
6233           .addImm(0)
6234           .add(Src0) // True
6235           .addReg(CopySCC);
6236 
6237   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6238   legalizeOperands(*UpdatedInst, MDT);
6239   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6240 }
6241 
6242 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
6243                                  MachineInstr &Inst) const {
6244   MachineBasicBlock &MBB = *Inst.getParent();
6245   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6246   MachineBasicBlock::iterator MII = Inst;
6247   DebugLoc DL = Inst.getDebugLoc();
6248 
6249   MachineOperand &Dest = Inst.getOperand(0);
6250   MachineOperand &Src = Inst.getOperand(1);
6251   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6252   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6253 
6254   unsigned SubOp = ST.hasAddNoCarry() ?
6255     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
6256 
6257   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
6258     .addImm(0)
6259     .addReg(Src.getReg());
6260 
6261   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
6262     .addReg(Src.getReg())
6263     .addReg(TmpReg);
6264 
6265   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6266   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6267 }
6268 
6269 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
6270                                   MachineInstr &Inst) const {
6271   MachineBasicBlock &MBB = *Inst.getParent();
6272   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6273   MachineBasicBlock::iterator MII = Inst;
6274   const DebugLoc &DL = Inst.getDebugLoc();
6275 
6276   MachineOperand &Dest = Inst.getOperand(0);
6277   MachineOperand &Src0 = Inst.getOperand(1);
6278   MachineOperand &Src1 = Inst.getOperand(2);
6279 
6280   if (ST.hasDLInsts()) {
6281     Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6282     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
6283     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
6284 
6285     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
6286         .add(Src0)
6287         .add(Src1);
6288 
6289     MRI.replaceRegWith(Dest.getReg(), NewDest);
6290     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6291   } else {
6292     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
6293     // invert either source and then perform the XOR. If either source is a
6294     // scalar register, then we can leave the inversion on the scalar unit to
6295     // achieve a better distribution of scalar and vector instructions.
6296     bool Src0IsSGPR = Src0.isReg() &&
6297                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
6298     bool Src1IsSGPR = Src1.isReg() &&
6299                       RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
6300     MachineInstr *Xor;
6301     Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6302     Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6303 
6304     // Build a pair of scalar instructions and add them to the work list.
6305     // The next iteration over the work list will lower these to the vector
6306     // unit as necessary.
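    // For illustration, with Src0 in an SGPR the s_xnor lowers to
    //   %Temp    = S_NOT_B32 Src0
    //   %NewDest = S_XOR_B32 %Temp, Src1
    // and the XOR is queued so a later worklist iteration can move it to the
    // VALU if its result turns out to be needed in VGPRs.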
6307 if (Src0IsSGPR) { 6308 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); 6309 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6310 .addReg(Temp) 6311 .add(Src1); 6312 } else if (Src1IsSGPR) { 6313 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); 6314 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6315 .add(Src0) 6316 .addReg(Temp); 6317 } else { 6318 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 6319 .add(Src0) 6320 .add(Src1); 6321 MachineInstr *Not = 6322 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); 6323 Worklist.insert(Not); 6324 } 6325 6326 MRI.replaceRegWith(Dest.getReg(), NewDest); 6327 6328 Worklist.insert(Xor); 6329 6330 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6331 } 6332 } 6333 6334 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, 6335 MachineInstr &Inst, 6336 unsigned Opcode) const { 6337 MachineBasicBlock &MBB = *Inst.getParent(); 6338 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6339 MachineBasicBlock::iterator MII = Inst; 6340 const DebugLoc &DL = Inst.getDebugLoc(); 6341 6342 MachineOperand &Dest = Inst.getOperand(0); 6343 MachineOperand &Src0 = Inst.getOperand(1); 6344 MachineOperand &Src1 = Inst.getOperand(2); 6345 6346 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6347 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6348 6349 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 6350 .add(Src0) 6351 .add(Src1); 6352 6353 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 6354 .addReg(Interm); 6355 6356 Worklist.insert(&Op); 6357 Worklist.insert(&Not); 6358 6359 MRI.replaceRegWith(Dest.getReg(), NewDest); 6360 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6361 } 6362 6363 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, 6364 MachineInstr &Inst, 6365 unsigned Opcode) const { 6366 MachineBasicBlock &MBB = *Inst.getParent(); 6367 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6368 MachineBasicBlock::iterator MII = Inst; 6369 const DebugLoc &DL = Inst.getDebugLoc(); 6370 6371 MachineOperand &Dest = Inst.getOperand(0); 6372 MachineOperand &Src0 = Inst.getOperand(1); 6373 MachineOperand &Src1 = Inst.getOperand(2); 6374 6375 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6376 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6377 6378 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 6379 .add(Src1); 6380 6381 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 6382 .add(Src0) 6383 .addReg(Interm); 6384 6385 Worklist.insert(&Not); 6386 Worklist.insert(&Op); 6387 6388 MRI.replaceRegWith(Dest.getReg(), NewDest); 6389 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6390 } 6391 6392 void SIInstrInfo::splitScalar64BitUnaryOp( 6393 SetVectorType &Worklist, MachineInstr &Inst, 6394 unsigned Opcode, bool Swap) const { 6395 MachineBasicBlock &MBB = *Inst.getParent(); 6396 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6397 6398 MachineOperand &Dest = Inst.getOperand(0); 6399 MachineOperand &Src0 = Inst.getOperand(1); 6400 DebugLoc DL = Inst.getDebugLoc(); 6401 6402 MachineBasicBlock::iterator MII = Inst; 6403 6404 const MCInstrDesc &InstDesc = get(Opcode); 6405 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
6406 MRI.getRegClass(Src0.getReg()) : 6407 &AMDGPU::SGPR_32RegClass; 6408 6409 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 6410 6411 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6412 AMDGPU::sub0, Src0SubRC); 6413 6414 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6415 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 6416 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 6417 6418 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 6419 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 6420 6421 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6422 AMDGPU::sub1, Src0SubRC); 6423 6424 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 6425 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 6426 6427 if (Swap) 6428 std::swap(DestSub0, DestSub1); 6429 6430 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 6431 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 6432 .addReg(DestSub0) 6433 .addImm(AMDGPU::sub0) 6434 .addReg(DestSub1) 6435 .addImm(AMDGPU::sub1); 6436 6437 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 6438 6439 Worklist.insert(&LoHalf); 6440 Worklist.insert(&HiHalf); 6441 6442 // We don't need to legalizeOperands here because for a single operand, src0 6443 // will support any kind of input. 6444 6445 // Move all users of this moved value. 6446 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 6447 } 6448 6449 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, 6450 MachineInstr &Inst, 6451 MachineDominatorTree *MDT) const { 6452 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 6453 6454 MachineBasicBlock &MBB = *Inst.getParent(); 6455 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6456 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6457 6458 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 6459 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6460 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6461 6462 Register CarryReg = MRI.createVirtualRegister(CarryRC); 6463 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 6464 6465 MachineOperand &Dest = Inst.getOperand(0); 6466 MachineOperand &Src0 = Inst.getOperand(1); 6467 MachineOperand &Src1 = Inst.getOperand(2); 6468 const DebugLoc &DL = Inst.getDebugLoc(); 6469 MachineBasicBlock::iterator MII = Inst; 6470 6471 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 6472 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 6473 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); 6474 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 6475 6476 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6477 AMDGPU::sub0, Src0SubRC); 6478 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6479 AMDGPU::sub0, Src1SubRC); 6480 6481 6482 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6483 AMDGPU::sub1, Src0SubRC); 6484 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6485 AMDGPU::sub1, Src1SubRC); 6486 6487 unsigned LoOpc = IsAdd ? 
AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6488 MachineInstr *LoHalf =
6489 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
6490 .addReg(CarryReg, RegState::Define)
6491 .add(SrcReg0Sub0)
6492 .add(SrcReg1Sub0)
6493 .addImm(0); // clamp bit
6494
6495 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6496 MachineInstr *HiHalf =
6497 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
6498 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6499 .add(SrcReg0Sub1)
6500 .add(SrcReg1Sub1)
6501 .addReg(CarryReg, RegState::Kill)
6502 .addImm(0); // clamp bit
6503
6504 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6505 .addReg(DestSub0)
6506 .addImm(AMDGPU::sub0)
6507 .addReg(DestSub1)
6508 .addImm(AMDGPU::sub1);
6509
6510 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6511
6512 // Try to legalize the operands in case we need to swap the order to keep it
6513 // valid.
6514 legalizeOperands(*LoHalf, MDT);
6515 legalizeOperands(*HiHalf, MDT);
6516
6517 // Move all users of this moved value.
6518 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6519 }
6520
6521 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
6522 MachineInstr &Inst, unsigned Opcode,
6523 MachineDominatorTree *MDT) const {
6524 MachineBasicBlock &MBB = *Inst.getParent();
6525 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6526
6527 MachineOperand &Dest = Inst.getOperand(0);
6528 MachineOperand &Src0 = Inst.getOperand(1);
6529 MachineOperand &Src1 = Inst.getOperand(2);
6530 DebugLoc DL = Inst.getDebugLoc();
6531
6532 MachineBasicBlock::iterator MII = Inst;
6533
6534 const MCInstrDesc &InstDesc = get(Opcode);
6535 const TargetRegisterClass *Src0RC = Src0.isReg() ?
6536 MRI.getRegClass(Src0.getReg()) :
6537 &AMDGPU::SGPR_32RegClass;
6538
6539 const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6540 const TargetRegisterClass *Src1RC = Src1.isReg() ?
6541 MRI.getRegClass(Src1.getReg()) : 6542 &AMDGPU::SGPR_32RegClass; 6543 6544 const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); 6545 6546 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6547 AMDGPU::sub0, Src0SubRC); 6548 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6549 AMDGPU::sub0, Src1SubRC); 6550 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 6551 AMDGPU::sub1, Src0SubRC); 6552 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 6553 AMDGPU::sub1, Src1SubRC); 6554 6555 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6556 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 6557 const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); 6558 6559 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 6560 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 6561 .add(SrcReg0Sub0) 6562 .add(SrcReg1Sub0); 6563 6564 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 6565 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 6566 .add(SrcReg0Sub1) 6567 .add(SrcReg1Sub1); 6568 6569 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 6570 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 6571 .addReg(DestSub0) 6572 .addImm(AMDGPU::sub0) 6573 .addReg(DestSub1) 6574 .addImm(AMDGPU::sub1); 6575 6576 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 6577 6578 Worklist.insert(&LoHalf); 6579 Worklist.insert(&HiHalf); 6580 6581 // Move all users of this moved vlaue. 6582 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 6583 } 6584 6585 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, 6586 MachineInstr &Inst, 6587 MachineDominatorTree *MDT) const { 6588 MachineBasicBlock &MBB = *Inst.getParent(); 6589 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6590 6591 MachineOperand &Dest = Inst.getOperand(0); 6592 MachineOperand &Src0 = Inst.getOperand(1); 6593 MachineOperand &Src1 = Inst.getOperand(2); 6594 const DebugLoc &DL = Inst.getDebugLoc(); 6595 6596 MachineBasicBlock::iterator MII = Inst; 6597 6598 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 6599 6600 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 6601 6602 MachineOperand* Op0; 6603 MachineOperand* Op1; 6604 6605 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 6606 Op0 = &Src0; 6607 Op1 = &Src1; 6608 } else { 6609 Op0 = &Src1; 6610 Op1 = &Src0; 6611 } 6612 6613 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 6614 .add(*Op0); 6615 6616 Register NewDest = MRI.createVirtualRegister(DestRC); 6617 6618 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 6619 .addReg(Interm) 6620 .add(*Op1); 6621 6622 MRI.replaceRegWith(Dest.getReg(), NewDest); 6623 6624 Worklist.insert(&Xor); 6625 } 6626 6627 void SIInstrInfo::splitScalar64BitBCNT( 6628 SetVectorType &Worklist, MachineInstr &Inst) const { 6629 MachineBasicBlock &MBB = *Inst.getParent(); 6630 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6631 6632 MachineBasicBlock::iterator MII = Inst; 6633 const DebugLoc &DL = Inst.getDebugLoc(); 6634 6635 MachineOperand &Dest = Inst.getOperand(0); 6636 MachineOperand &Src = Inst.getOperand(1); 6637 6638 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 6639 const TargetRegisterClass *SrcRC = Src.isReg() ? 
6640 MRI.getRegClass(Src.getReg()) :
6641 &AMDGPU::SGPR_32RegClass;
6642
6643 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6644 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6645
6646 const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
6647
6648 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6649 AMDGPU::sub0, SrcSubRC);
6650 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6651 AMDGPU::sub1, SrcSubRC);
6652
6653 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
6654
6655 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
6656
6657 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6658
6659 // We don't need to legalize operands here. src0 for either instruction can be
6660 // an SGPR, and the second input is unused or determined here.
6661 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6662 }
6663
6664 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
6665 MachineInstr &Inst) const {
6666 MachineBasicBlock &MBB = *Inst.getParent();
6667 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6668 MachineBasicBlock::iterator MII = Inst;
6669 const DebugLoc &DL = Inst.getDebugLoc();
6670
6671 MachineOperand &Dest = Inst.getOperand(0);
6672 uint32_t Imm = Inst.getOperand(2).getImm();
6673 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
6674 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
6675
6676 (void) Offset;
6677
6678 // Only sext_inreg cases handled.
6679 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
6680 Offset == 0 && "Not implemented");
6681
6682 if (BitWidth < 32) {
6683 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6684 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6685 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6686
6687 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
6688 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
6689 .addImm(0)
6690 .addImm(BitWidth);
6691
6692 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
6693 .addImm(31)
6694 .addReg(MidRegLo);
6695
6696 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6697 .addReg(MidRegLo)
6698 .addImm(AMDGPU::sub0)
6699 .addReg(MidRegHi)
6700 .addImm(AMDGPU::sub1);
6701
6702 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6703 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6704 return;
6705 }
6706
6707 MachineOperand &Src = Inst.getOperand(1);
6708 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6709 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6710
6711 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
6712 .addImm(31)
6713 .addReg(Src.getReg(), 0, AMDGPU::sub0);
6714
6715 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6716 .addReg(Src.getReg(), 0, AMDGPU::sub0)
6717 .addImm(AMDGPU::sub0)
6718 .addReg(TmpReg)
6719 .addImm(AMDGPU::sub1);
6720
6721 MRI.replaceRegWith(Dest.getReg(), ResultReg);
6722 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6723 }
6724
6725 void SIInstrInfo::addUsersToMoveToVALUWorklist(
6726 Register DstReg,
6727 MachineRegisterInfo &MRI,
6728 SetVectorType &Worklist) const {
6729 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
6730 E = MRI.use_end(); I != E;) {
6731 MachineInstr &UseMI = *I->getParent();
6732
6733 unsigned OpNo =
0; 6734 6735 switch (UseMI.getOpcode()) { 6736 case AMDGPU::COPY: 6737 case AMDGPU::WQM: 6738 case AMDGPU::SOFT_WQM: 6739 case AMDGPU::STRICT_WWM: 6740 case AMDGPU::STRICT_WQM: 6741 case AMDGPU::REG_SEQUENCE: 6742 case AMDGPU::PHI: 6743 case AMDGPU::INSERT_SUBREG: 6744 break; 6745 default: 6746 OpNo = I.getOperandNo(); 6747 break; 6748 } 6749 6750 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 6751 Worklist.insert(&UseMI); 6752 6753 do { 6754 ++I; 6755 } while (I != E && I->getParent() == &UseMI); 6756 } else { 6757 ++I; 6758 } 6759 } 6760 } 6761 6762 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, 6763 MachineRegisterInfo &MRI, 6764 MachineInstr &Inst) const { 6765 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6766 MachineBasicBlock *MBB = Inst.getParent(); 6767 MachineOperand &Src0 = Inst.getOperand(1); 6768 MachineOperand &Src1 = Inst.getOperand(2); 6769 const DebugLoc &DL = Inst.getDebugLoc(); 6770 6771 switch (Inst.getOpcode()) { 6772 case AMDGPU::S_PACK_LL_B32_B16: { 6773 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6774 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6775 6776 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 6777 // 0. 6778 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6779 .addImm(0xffff); 6780 6781 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 6782 .addReg(ImmReg, RegState::Kill) 6783 .add(Src0); 6784 6785 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 6786 .add(Src1) 6787 .addImm(16) 6788 .addReg(TmpReg, RegState::Kill); 6789 break; 6790 } 6791 case AMDGPU::S_PACK_LH_B32_B16: { 6792 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6793 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6794 .addImm(0xffff); 6795 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) 6796 .addReg(ImmReg, RegState::Kill) 6797 .add(Src0) 6798 .add(Src1); 6799 break; 6800 } 6801 case AMDGPU::S_PACK_HH_B32_B16: { 6802 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6803 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6804 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 6805 .addImm(16) 6806 .add(Src0); 6807 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 6808 .addImm(0xffff0000); 6809 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) 6810 .add(Src1) 6811 .addReg(ImmReg, RegState::Kill) 6812 .addReg(TmpReg, RegState::Kill); 6813 break; 6814 } 6815 default: 6816 llvm_unreachable("unhandled s_pack_* instruction"); 6817 } 6818 6819 MachineOperand &Dest = Inst.getOperand(0); 6820 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6821 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6822 } 6823 6824 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 6825 MachineInstr &SCCDefInst, 6826 SetVectorType &Worklist) const { 6827 bool SCCUsedImplicitly = false; 6828 6829 // Ensure that def inst defines SCC, which is still live. 6830 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 6831 !Op.isDead() && Op.getParent() == &SCCDefInst); 6832 SmallVector<MachineInstr *, 4> CopyToDelete; 6833 // This assumes that all the users of SCC are in the same block 6834 // as the SCC def. 6835 for (MachineInstr &MI : // Skip the def inst itself. 
6836 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
6837 SCCDefInst.getParent()->end())) {
6838 // Check if SCC is used first.
6839 if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
6840 if (MI.isCopy()) {
6841 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6842 Register DestReg = MI.getOperand(0).getReg();
6843
6844 for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
6845 if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
6846 (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
6847 User.getOperand(4).setReg(RI.getVCC());
6848 Worklist.insert(&User);
6849 } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
6850 User.getOperand(5).setReg(RI.getVCC());
6851 // No need to add to Worklist.
6852 }
6853 }
6854 CopyToDelete.push_back(&MI);
6855 } else {
6856 if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
6857 MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
6858 // This is an implicit use of SCC, and the SCC users are expected
6859 // to handle it.
6860 // We cannot preserve the edge to the user, so add the explicit
6861 // copy: SCC = COPY VCC.
6862 // The copy will be cleaned up during the processing of the user
6863 // in lowerSelect.
6864 SCCUsedImplicitly = true;
6865 }
6866
6867 Worklist.insert(&MI);
6868 }
6869 }
6870 // Exit if we find another SCC def.
6871 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
6872 break;
6873 }
6874 for (auto &Copy : CopyToDelete)
6875 Copy->eraseFromParent();
6876
6877 if (SCCUsedImplicitly) {
6878 BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
6879 SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
6880 .addReg(RI.getVCC());
6881 }
6882 }
6883
6884 // Instructions that use SCC may be converted to VALU instructions. When that
6885 // happens, the SCC register is changed to VCC_LO. The instruction that defines
6886 // SCC must be changed to an instruction that defines VCC. This function makes
6887 // sure that the instruction that defines SCC is added to the moveToVALU
6888 // worklist.
6889 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
6890 SetVectorType &Worklist) const {
6891 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
6892
6893 MachineInstr *SCCUseInst = Op.getParent();
6894 // Look for a preceding instruction that either defines VCC or SCC. If VCC
6895 // then there is nothing to do because the defining instruction has been
6896 // converted to a VALU already. If SCC then that instruction needs to be
6897 // converted to a VALU.
6898 for (MachineInstr &MI :
6899 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
6900 SCCUseInst->getParent()->rend())) {
6901 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
6902 break;
6903 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
6904 Worklist.insert(&MI);
6905 break;
6906 }
6907 }
6908 }
6909
6910 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
6911 const MachineInstr &Inst) const {
6912 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
6913
6914 switch (Inst.getOpcode()) {
6915 // For target instructions, getOpRegClass just returns the virtual register
6916 // class associated with the operand, so we need to find an equivalent VGPR
6917 // register class in order to move the instruction to the VALU.
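// The copy-like pseudos below are handled together: when the source is an
// AGPR class, a destination that already uses AGPRs yields nullptr, PHI /
// REG_SEQUENCE / INSERT_SUBREG take an equivalent AGPR class, and the rest
// switch to an equivalent VGPR class; when the source is not an AGPR class,
// a destination that is already a vector class (or VReg_1) yields nullptr
// and anything else is retyped to the equivalent VGPR class.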
6918 case AMDGPU::COPY: 6919 case AMDGPU::PHI: 6920 case AMDGPU::REG_SEQUENCE: 6921 case AMDGPU::INSERT_SUBREG: 6922 case AMDGPU::WQM: 6923 case AMDGPU::SOFT_WQM: 6924 case AMDGPU::STRICT_WWM: 6925 case AMDGPU::STRICT_WQM: { 6926 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 6927 if (RI.hasAGPRs(SrcRC)) { 6928 if (RI.hasAGPRs(NewDstRC)) 6929 return nullptr; 6930 6931 switch (Inst.getOpcode()) { 6932 case AMDGPU::PHI: 6933 case AMDGPU::REG_SEQUENCE: 6934 case AMDGPU::INSERT_SUBREG: 6935 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 6936 break; 6937 default: 6938 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 6939 } 6940 6941 if (!NewDstRC) 6942 return nullptr; 6943 } else { 6944 if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 6945 return nullptr; 6946 6947 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 6948 if (!NewDstRC) 6949 return nullptr; 6950 } 6951 6952 return NewDstRC; 6953 } 6954 default: 6955 return NewDstRC; 6956 } 6957 } 6958 6959 // Find the one SGPR operand we are allowed to use. 6960 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 6961 int OpIndices[3]) const { 6962 const MCInstrDesc &Desc = MI.getDesc(); 6963 6964 // Find the one SGPR operand we are allowed to use. 6965 // 6966 // First we need to consider the instruction's operand requirements before 6967 // legalizing. Some operands are required to be SGPRs, such as implicit uses 6968 // of VCC, but we are still bound by the constant bus requirement to only use 6969 // one. 6970 // 6971 // If the operand's class is an SGPR, we can never move it. 6972 6973 Register SGPRReg = findImplicitSGPRRead(MI); 6974 if (SGPRReg != AMDGPU::NoRegister) 6975 return SGPRReg; 6976 6977 Register UsedSGPRs[3] = { AMDGPU::NoRegister }; 6978 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 6979 6980 for (unsigned i = 0; i < 3; ++i) { 6981 int Idx = OpIndices[i]; 6982 if (Idx == -1) 6983 break; 6984 6985 const MachineOperand &MO = MI.getOperand(Idx); 6986 if (!MO.isReg()) 6987 continue; 6988 6989 // Is this operand statically required to be an SGPR based on the operand 6990 // constraints? 6991 const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); 6992 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 6993 if (IsRequiredSGPR) 6994 return MO.getReg(); 6995 6996 // If this could be a VGPR or an SGPR, Check the dynamic register class. 6997 Register Reg = MO.getReg(); 6998 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 6999 if (RI.isSGPRClass(RegRC)) 7000 UsedSGPRs[i] = Reg; 7001 } 7002 7003 // We don't have a required SGPR operand, so we have a bit more freedom in 7004 // selecting operands to move. 7005 7006 // Try to select the most used SGPR. If an SGPR is equal to one of the 7007 // others, we choose that. 7008 // 7009 // e.g. 7010 // V_FMA_F32 v0, s0, s0, s0 -> No moves 7011 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 7012 7013 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 7014 // prefer those. 
7015 7016 if (UsedSGPRs[0] != AMDGPU::NoRegister) { 7017 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 7018 SGPRReg = UsedSGPRs[0]; 7019 } 7020 7021 if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) { 7022 if (UsedSGPRs[1] == UsedSGPRs[2]) 7023 SGPRReg = UsedSGPRs[1]; 7024 } 7025 7026 return SGPRReg; 7027 } 7028 7029 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 7030 unsigned OperandName) const { 7031 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 7032 if (Idx == -1) 7033 return nullptr; 7034 7035 return &MI.getOperand(Idx); 7036 } 7037 7038 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 7039 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 7040 return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) | 7041 (1ULL << 56) | // RESOURCE_LEVEL = 1 7042 (3ULL << 60); // OOB_SELECT = 3 7043 } 7044 7045 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 7046 if (ST.isAmdHsaOS()) { 7047 // Set ATC = 1. GFX9 doesn't have this bit. 7048 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) 7049 RsrcDataFormat |= (1ULL << 56); 7050 7051 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. 7052 // BTW, it disables TC L2 and therefore decreases performance. 7053 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) 7054 RsrcDataFormat |= (2ULL << 59); 7055 } 7056 7057 return RsrcDataFormat; 7058 } 7059 7060 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 7061 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 7062 AMDGPU::RSRC_TID_ENABLE | 7063 0xffffffff; // Size; 7064 7065 // GFX9 doesn't have ELEMENT_SIZE. 7066 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 7067 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; 7068 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; 7069 } 7070 7071 // IndexStride = 64 / 32. 7072 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; 7073 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 7074 7075 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 7076 // Clear them unless we want a huge stride. 
7077 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 7078 ST.getGeneration() <= AMDGPUSubtarget::GFX9) 7079 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 7080 7081 return Rsrc23; 7082 } 7083 7084 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 7085 unsigned Opc = MI.getOpcode(); 7086 7087 return isSMRD(Opc); 7088 } 7089 7090 bool SIInstrInfo::isHighLatencyDef(int Opc) const { 7091 return get(Opc).mayLoad() && 7092 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); 7093 } 7094 7095 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 7096 int &FrameIndex) const { 7097 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 7098 if (!Addr || !Addr->isFI()) 7099 return AMDGPU::NoRegister; 7100 7101 assert(!MI.memoperands_empty() && 7102 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 7103 7104 FrameIndex = Addr->getIndex(); 7105 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 7106 } 7107 7108 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 7109 int &FrameIndex) const { 7110 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 7111 assert(Addr && Addr->isFI()); 7112 FrameIndex = Addr->getIndex(); 7113 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 7114 } 7115 7116 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 7117 int &FrameIndex) const { 7118 if (!MI.mayLoad()) 7119 return AMDGPU::NoRegister; 7120 7121 if (isMUBUF(MI) || isVGPRSpill(MI)) 7122 return isStackAccess(MI, FrameIndex); 7123 7124 if (isSGPRSpill(MI)) 7125 return isSGPRStackAccess(MI, FrameIndex); 7126 7127 return AMDGPU::NoRegister; 7128 } 7129 7130 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 7131 int &FrameIndex) const { 7132 if (!MI.mayStore()) 7133 return AMDGPU::NoRegister; 7134 7135 if (isMUBUF(MI) || isVGPRSpill(MI)) 7136 return isStackAccess(MI, FrameIndex); 7137 7138 if (isSGPRSpill(MI)) 7139 return isSGPRStackAccess(MI, FrameIndex); 7140 7141 return AMDGPU::NoRegister; 7142 } 7143 7144 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 7145 unsigned Size = 0; 7146 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 7147 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 7148 while (++I != E && I->isInsideBundle()) { 7149 assert(!I->isBundle() && "No nested bundle!"); 7150 Size += getInstSizeInBytes(*I); 7151 } 7152 7153 return Size; 7154 } 7155 7156 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 7157 unsigned Opc = MI.getOpcode(); 7158 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 7159 unsigned DescSize = Desc.getSize(); 7160 7161 // If we have a definitive size, we can use it. Otherwise we need to inspect 7162 // the operands to know the size. 7163 if (isFixedSize(MI)) { 7164 unsigned Size = DescSize; 7165 7166 // If we hit the buggy offset, an extra nop will be inserted in MC so 7167 // estimate the worst case. 7168 if (MI.isBranch() && ST.hasOffset3fBug()) 7169 Size += 4; 7170 7171 return Size; 7172 } 7173 7174 // 4-byte instructions may have a 32-bit literal encoded after them. Check 7175 // operands that coud ever be literals. 7176 if (isVALU(MI) || isSALU(MI)) { 7177 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 7178 if (Src0Idx == -1) 7179 return DescSize; // No operands. 7180 7181 if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) 7182 return isVOP3(MI) ? 
12 : (DescSize + 4); 7183 7184 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 7185 if (Src1Idx == -1) 7186 return DescSize; 7187 7188 if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) 7189 return isVOP3(MI) ? 12 : (DescSize + 4); 7190 7191 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); 7192 if (Src2Idx == -1) 7193 return DescSize; 7194 7195 if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) 7196 return isVOP3(MI) ? 12 : (DescSize + 4); 7197 7198 return DescSize; 7199 } 7200 7201 // Check whether we have extra NSA words. 7202 if (isMIMG(MI)) { 7203 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 7204 if (VAddr0Idx < 0) 7205 return 8; 7206 7207 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 7208 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 7209 } 7210 7211 switch (Opc) { 7212 case TargetOpcode::BUNDLE: 7213 return getInstBundleSize(MI); 7214 case TargetOpcode::INLINEASM: 7215 case TargetOpcode::INLINEASM_BR: { 7216 const MachineFunction *MF = MI.getParent()->getParent(); 7217 const char *AsmStr = MI.getOperand(0).getSymbolName(); 7218 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); 7219 } 7220 default: 7221 if (MI.isMetaInstruction()) 7222 return 0; 7223 return DescSize; 7224 } 7225 } 7226 7227 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 7228 if (!isFLAT(MI)) 7229 return false; 7230 7231 if (MI.memoperands_empty()) 7232 return true; 7233 7234 for (const MachineMemOperand *MMO : MI.memoperands()) { 7235 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 7236 return true; 7237 } 7238 return false; 7239 } 7240 7241 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 7242 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 7243 } 7244 7245 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 7246 MachineBasicBlock *IfEnd) const { 7247 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 7248 assert(TI != IfEntry->end()); 7249 7250 MachineInstr *Branch = &(*TI); 7251 MachineFunction *MF = IfEntry->getParent(); 7252 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 7253 7254 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7255 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7256 MachineInstr *SIIF = 7257 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 7258 .add(Branch->getOperand(0)) 7259 .add(Branch->getOperand(1)); 7260 MachineInstr *SIEND = 7261 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 7262 .addReg(DstReg); 7263 7264 IfEntry->erase(TI); 7265 IfEntry->insert(IfEntry->end(), SIIF); 7266 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 7267 } 7268 } 7269 7270 void SIInstrInfo::convertNonUniformLoopRegion( 7271 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 7272 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 7273 // We expect 2 terminators, one conditional and one unconditional. 
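// The rewrite below turns the divergent back edge into structured control
// flow: the latch's SI_NON_UNIFORM_BRCOND_PSEUDO is replaced by an
// SI_IF_BREAK feeding SI_LOOP, and a PHI is added to the loop header that
// merges a materialized zero mask from every non-latch predecessor with the
// accumulated break mask coming in along the back edge.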
7274 assert(TI != LoopEnd->end()); 7275 7276 MachineInstr *Branch = &(*TI); 7277 MachineFunction *MF = LoopEnd->getParent(); 7278 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 7279 7280 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7281 7282 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7283 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 7284 MachineInstrBuilder HeaderPHIBuilder = 7285 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 7286 for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), 7287 E = LoopEntry->pred_end(); 7288 PI != E; ++PI) { 7289 if (*PI == LoopEnd) { 7290 HeaderPHIBuilder.addReg(BackEdgeReg); 7291 } else { 7292 MachineBasicBlock *PMBB = *PI; 7293 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 7294 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 7295 ZeroReg, 0); 7296 HeaderPHIBuilder.addReg(ZeroReg); 7297 } 7298 HeaderPHIBuilder.addMBB(*PI); 7299 } 7300 MachineInstr *HeaderPhi = HeaderPHIBuilder; 7301 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 7302 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 7303 .addReg(DstReg) 7304 .add(Branch->getOperand(0)); 7305 MachineInstr *SILOOP = 7306 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 7307 .addReg(BackEdgeReg) 7308 .addMBB(LoopEntry); 7309 7310 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 7311 LoopEnd->erase(TI); 7312 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 7313 LoopEnd->insert(LoopEnd->end(), SILOOP); 7314 } 7315 } 7316 7317 ArrayRef<std::pair<int, const char *>> 7318 SIInstrInfo::getSerializableTargetIndices() const { 7319 static const std::pair<int, const char *> TargetIndices[] = { 7320 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 7321 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 7322 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 7323 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 7324 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 7325 return makeArrayRef(TargetIndices); 7326 } 7327 7328 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 7329 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 7330 ScheduleHazardRecognizer * 7331 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 7332 const ScheduleDAG *DAG) const { 7333 return new GCNHazardRecognizer(DAG->MF); 7334 } 7335 7336 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 7337 /// pass. 
7338 ScheduleHazardRecognizer * 7339 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 7340 return new GCNHazardRecognizer(MF); 7341 } 7342 7343 std::pair<unsigned, unsigned> 7344 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 7345 return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); 7346 } 7347 7348 ArrayRef<std::pair<unsigned, const char *>> 7349 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 7350 static const std::pair<unsigned, const char *> TargetFlags[] = { 7351 { MO_GOTPCREL, "amdgpu-gotprel" }, 7352 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 7353 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 7354 { MO_REL32_LO, "amdgpu-rel32-lo" }, 7355 { MO_REL32_HI, "amdgpu-rel32-hi" }, 7356 { MO_ABS32_LO, "amdgpu-abs32-lo" }, 7357 { MO_ABS32_HI, "amdgpu-abs32-hi" }, 7358 }; 7359 7360 return makeArrayRef(TargetFlags); 7361 } 7362 7363 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 7364 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 7365 MI.modifiesRegister(AMDGPU::EXEC, &RI); 7366 } 7367 7368 MachineInstrBuilder 7369 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 7370 MachineBasicBlock::iterator I, 7371 const DebugLoc &DL, 7372 Register DestReg) const { 7373 if (ST.hasAddNoCarry()) 7374 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 7375 7376 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7377 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 7378 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 7379 7380 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 7381 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 7382 } 7383 7384 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 7385 MachineBasicBlock::iterator I, 7386 const DebugLoc &DL, 7387 Register DestReg, 7388 RegScavenger &RS) const { 7389 if (ST.hasAddNoCarry()) 7390 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); 7391 7392 // If available, prefer to use vcc. 7393 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 7394 ? Register(RI.getVCC()) 7395 : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); 7396 7397 // TODO: Users need to deal with this. 7398 if (!UnusedCarry.isValid()) 7399 return MachineInstrBuilder(); 7400 7401 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 7402 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 7403 } 7404 7405 bool SIInstrInfo::isKillTerminator(unsigned Opcode) { 7406 switch (Opcode) { 7407 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 7408 case AMDGPU::SI_KILL_I1_TERMINATOR: 7409 return true; 7410 default: 7411 return false; 7412 } 7413 } 7414 7415 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { 7416 switch (Opcode) { 7417 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 7418 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); 7419 case AMDGPU::SI_KILL_I1_PSEUDO: 7420 return get(AMDGPU::SI_KILL_I1_TERMINATOR); 7421 default: 7422 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); 7423 } 7424 } 7425 7426 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { 7427 if (!ST.isWave32()) 7428 return; 7429 7430 for (auto &Op : MI.implicit_operands()) { 7431 if (Op.isReg() && Op.getReg() == AMDGPU::VCC) 7432 Op.setReg(AMDGPU::VCC_LO); 7433 } 7434 } 7435 7436 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { 7437 if (!isSMRD(MI)) 7438 return false; 7439 7440 // Check that it is using a buffer resource. 
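// Buffer SMRDs (s_buffer_load_*) address memory through a 128-bit resource
// descriptor in sbase, while plain s_load_* takes a 64-bit base address, so
// the register class of the sbase operand is what distinguishes the two.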
7441 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); 7442 if (Idx == -1) // e.g. s_memtime 7443 return false; 7444 7445 const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; 7446 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); 7447 } 7448 7449 // Depending on the used address space and instructions, some immediate offsets 7450 // are allowed and some are not. 7451 // In general, flat instruction offsets can only be non-negative, global and 7452 // scratch instruction offsets can also be negative. 7453 // 7454 // There are several bugs related to these offsets: 7455 // On gfx10.1, flat instructions that go into the global address space cannot 7456 // use an offset. 7457 // 7458 // For scratch instructions, the address can be either an SGPR or a VGPR. 7459 // The following offsets can be used, depending on the architecture (x means 7460 // cannot be used): 7461 // +----------------------------+------+------+ 7462 // | Address-Mode | SGPR | VGPR | 7463 // +----------------------------+------+------+ 7464 // | gfx9 | | | 7465 // | negative, 4-aligned offset | x | ok | 7466 // | negative, unaligned offset | x | ok | 7467 // +----------------------------+------+------+ 7468 // | gfx10 | | | 7469 // | negative, 4-aligned offset | ok | ok | 7470 // | negative, unaligned offset | ok | x | 7471 // +----------------------------+------+------+ 7472 // | gfx10.3 | | | 7473 // | negative, 4-aligned offset | ok | ok | 7474 // | negative, unaligned offset | ok | ok | 7475 // +----------------------------+------+------+ 7476 // 7477 // This function ignores the addressing mode, so if an offset cannot be used in 7478 // one addressing mode, it is considered illegal. 7479 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, 7480 uint64_t FlatVariant) const { 7481 // TODO: Should 0 be special cased? 7482 if (!ST.hasFlatInstOffsets()) 7483 return false; 7484 7485 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && 7486 (AddrSpace == AMDGPUAS::FLAT_ADDRESS || 7487 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) 7488 return false; 7489 7490 bool Signed = FlatVariant != SIInstrFlags::FLAT; 7491 if (ST.hasNegativeScratchOffsetBug() && 7492 FlatVariant == SIInstrFlags::FlatScratch) 7493 Signed = false; 7494 if (ST.hasNegativeUnalignedScratchOffsetBug() && 7495 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && 7496 (Offset % 4) != 0) { 7497 return false; 7498 } 7499 7500 unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed); 7501 return Signed ? isIntN(N, Offset) : isUIntN(N, Offset); 7502 } 7503 7504 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. 7505 std::pair<int64_t, int64_t> 7506 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, 7507 uint64_t FlatVariant) const { 7508 int64_t RemainderOffset = COffsetVal; 7509 int64_t ImmField = 0; 7510 bool Signed = FlatVariant != SIInstrFlags::FLAT; 7511 if (ST.hasNegativeScratchOffsetBug() && 7512 FlatVariant == SIInstrFlags::FlatScratch) 7513 Signed = false; 7514 7515 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed); 7516 if (Signed) { 7517 // Use signed division by a power of two to truncate towards 0. 
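// For example, assuming a 12-bit signed immediate field purely for
// illustration (NumBits == 12, so D == 2048):
//   COffsetVal ==  5000 -> RemainderOffset ==  4096, ImmField ==  904
//   COffsetVal == -5000 -> RemainderOffset == -4096, ImmField == -904
// Both immediates fit in the signed range [-2048, 2047], and
// RemainderOffset + ImmField always reconstructs the original offset; the
// real field width comes from AMDGPU::getNumFlatOffsetBits above.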
7518 int64_t D = 1LL << (NumBits - 1); 7519 RemainderOffset = (COffsetVal / D) * D; 7520 ImmField = COffsetVal - RemainderOffset; 7521 7522 if (ST.hasNegativeUnalignedScratchOffsetBug() && 7523 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && 7524 (ImmField % 4) != 0) { 7525 // Make ImmField a multiple of 4 7526 RemainderOffset += ImmField % 4; 7527 ImmField -= ImmField % 4; 7528 } 7529 } else if (COffsetVal >= 0) { 7530 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); 7531 RemainderOffset = COffsetVal - ImmField; 7532 } 7533 7534 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); 7535 assert(RemainderOffset + ImmField == COffsetVal); 7536 return {ImmField, RemainderOffset}; 7537 } 7538 7539 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td 7540 enum SIEncodingFamily { 7541 SI = 0, 7542 VI = 1, 7543 SDWA = 2, 7544 SDWA9 = 3, 7545 GFX80 = 4, 7546 GFX9 = 5, 7547 GFX10 = 6, 7548 SDWA10 = 7, 7549 GFX90A = 8 7550 }; 7551 7552 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { 7553 switch (ST.getGeneration()) { 7554 default: 7555 break; 7556 case AMDGPUSubtarget::SOUTHERN_ISLANDS: 7557 case AMDGPUSubtarget::SEA_ISLANDS: 7558 return SIEncodingFamily::SI; 7559 case AMDGPUSubtarget::VOLCANIC_ISLANDS: 7560 case AMDGPUSubtarget::GFX9: 7561 return SIEncodingFamily::VI; 7562 case AMDGPUSubtarget::GFX10: 7563 return SIEncodingFamily::GFX10; 7564 } 7565 llvm_unreachable("Unknown subtarget generation!"); 7566 } 7567 7568 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { 7569 switch(MCOp) { 7570 // These opcodes use indirect register addressing so 7571 // they need special handling by codegen (currently missing). 7572 // Therefore it is too risky to allow these opcodes 7573 // to be selected by dpp combiner or sdwa peepholer. 7574 case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 7575 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 7576 case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 7577 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 7578 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 7579 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 7580 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 7581 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 7582 return true; 7583 default: 7584 return false; 7585 } 7586 } 7587 7588 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 7589 SIEncodingFamily Gen = subtargetEncodingFamily(ST); 7590 7591 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 7592 ST.getGeneration() == AMDGPUSubtarget::GFX9) 7593 Gen = SIEncodingFamily::GFX9; 7594 7595 // Adjust the encoding family to GFX80 for D16 buffer instructions when the 7596 // subtarget has UnpackedD16VMem feature. 7597 // TODO: remove this when we discard GFX80 encoding. 7598 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 7599 Gen = SIEncodingFamily::GFX80; 7600 7601 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 7602 switch (ST.getGeneration()) { 7603 default: 7604 Gen = SIEncodingFamily::SDWA; 7605 break; 7606 case AMDGPUSubtarget::GFX9: 7607 Gen = SIEncodingFamily::SDWA9; 7608 break; 7609 case AMDGPUSubtarget::GFX10: 7610 Gen = SIEncodingFamily::SDWA10; 7611 break; 7612 } 7613 } 7614 7615 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 7616 7617 // -1 means that Opcode is already a native instruction. 
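// On subtargets with GFX90A instructions, the lookup below first tries the
// GFX90A encoding table and falls back to GFX9 when there is no GFX90A
// entry; if neither table has one, the MCOp selected above is kept.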
7618 if (MCOp == -1) 7619 return Opcode; 7620 7621 if (ST.hasGFX90AInsts()) { 7622 uint16_t NMCOp = (uint16_t)-1; 7623 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); 7624 if (NMCOp == (uint16_t)-1) 7625 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); 7626 if (NMCOp != (uint16_t)-1) 7627 MCOp = NMCOp; 7628 } 7629 7630 // (uint16_t)-1 means that Opcode is a pseudo instruction that has 7631 // no encoding in the given subtarget generation. 7632 if (MCOp == (uint16_t)-1) 7633 return -1; 7634 7635 if (isAsmOnlyOpcode(MCOp)) 7636 return -1; 7637 7638 return MCOp; 7639 } 7640 7641 static 7642 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 7643 assert(RegOpnd.isReg()); 7644 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : 7645 getRegSubRegPair(RegOpnd); 7646 } 7647 7648 TargetInstrInfo::RegSubRegPair 7649 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { 7650 assert(MI.isRegSequence()); 7651 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) 7652 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { 7653 auto &RegOp = MI.getOperand(1 + 2 * I); 7654 return getRegOrUndef(RegOp); 7655 } 7656 return TargetInstrInfo::RegSubRegPair(); 7657 } 7658 7659 // Try to find the definition of reg:subreg in subreg-manipulation pseudos 7660 // Following a subreg of reg:subreg isn't supported 7661 static bool followSubRegDef(MachineInstr &MI, 7662 TargetInstrInfo::RegSubRegPair &RSR) { 7663 if (!RSR.SubReg) 7664 return false; 7665 switch (MI.getOpcode()) { 7666 default: break; 7667 case AMDGPU::REG_SEQUENCE: 7668 RSR = getRegSequenceSubReg(MI, RSR.SubReg); 7669 return true; 7670 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg 7671 case AMDGPU::INSERT_SUBREG: 7672 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) 7673 // inserted the subreg we're looking for 7674 RSR = getRegOrUndef(MI.getOperand(2)); 7675 else { // the subreg in the rest of the reg 7676 auto R1 = getRegOrUndef(MI.getOperand(1)); 7677 if (R1.SubReg) // subreg of subreg isn't supported 7678 return false; 7679 RSR.Reg = R1.Reg; 7680 } 7681 return true; 7682 } 7683 return false; 7684 } 7685 7686 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 7687 MachineRegisterInfo &MRI) { 7688 assert(MRI.isSSA()); 7689 if (!P.Reg.isVirtual()) 7690 return nullptr; 7691 7692 auto RSR = P; 7693 auto *DefInst = MRI.getVRegDef(RSR.Reg); 7694 while (auto *MI = DefInst) { 7695 DefInst = nullptr; 7696 switch (MI->getOpcode()) { 7697 case AMDGPU::COPY: 7698 case AMDGPU::V_MOV_B32_e32: { 7699 auto &Op1 = MI->getOperand(1); 7700 if (Op1.isReg() && Op1.getReg().isVirtual()) { 7701 if (Op1.isUndef()) 7702 return nullptr; 7703 RSR = getRegSubRegPair(Op1); 7704 DefInst = MRI.getVRegDef(RSR.Reg); 7705 } 7706 break; 7707 } 7708 default: 7709 if (followSubRegDef(*MI, RSR)) { 7710 if (!RSR.Reg) 7711 return nullptr; 7712 DefInst = MRI.getVRegDef(RSR.Reg); 7713 } 7714 } 7715 if (!DefInst) 7716 return MI; 7717 } 7718 return nullptr; 7719 } 7720 7721 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, 7722 Register VReg, 7723 const MachineInstr &DefMI, 7724 const MachineInstr &UseMI) { 7725 assert(MRI.isSSA() && "Must be run on SSA"); 7726 7727 auto *TRI = MRI.getTargetRegisterInfo(); 7728 auto *DefBB = DefMI.getParent(); 7729 7730 // Don't bother searching between blocks, although it is possible this block 7731 // doesn't modify exec. 
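// Returning true is the conservative answer throughout this function: the
// caller must then assume EXEC may change between the def and the use.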
7732 if (UseMI.getParent() != DefBB) 7733 return true; 7734 7735 const int MaxInstScan = 20; 7736 int NumInst = 0; 7737 7738 // Stop scan at the use. 7739 auto E = UseMI.getIterator(); 7740 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 7741 if (I->isDebugInstr()) 7742 continue; 7743 7744 if (++NumInst > MaxInstScan) 7745 return true; 7746 7747 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 7748 return true; 7749 } 7750 7751 return false; 7752 } 7753 7754 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 7755 Register VReg, 7756 const MachineInstr &DefMI) { 7757 assert(MRI.isSSA() && "Must be run on SSA"); 7758 7759 auto *TRI = MRI.getTargetRegisterInfo(); 7760 auto *DefBB = DefMI.getParent(); 7761 7762 const int MaxUseScan = 10; 7763 int NumUse = 0; 7764 7765 for (auto &Use : MRI.use_nodbg_operands(VReg)) { 7766 auto &UseInst = *Use.getParent(); 7767 // Don't bother searching between blocks, although it is possible this block 7768 // doesn't modify exec. 7769 if (UseInst.getParent() != DefBB) 7770 return true; 7771 7772 if (++NumUse > MaxUseScan) 7773 return true; 7774 } 7775 7776 if (NumUse == 0) 7777 return false; 7778 7779 const int MaxInstScan = 20; 7780 int NumInst = 0; 7781 7782 // Stop scan when we have seen all the uses. 7783 for (auto I = std::next(DefMI.getIterator()); ; ++I) { 7784 assert(I != DefBB->end()); 7785 7786 if (I->isDebugInstr()) 7787 continue; 7788 7789 if (++NumInst > MaxInstScan) 7790 return true; 7791 7792 for (const MachineOperand &Op : I->operands()) { 7793 // We don't check reg masks here as they're used only on calls: 7794 // 1. EXEC is only considered const within one BB 7795 // 2. Call should be a terminator instruction if present in a BB 7796 7797 if (!Op.isReg()) 7798 continue; 7799 7800 Register Reg = Op.getReg(); 7801 if (Op.isUse()) { 7802 if (Reg == VReg && --NumUse == 0) 7803 return false; 7804 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) 7805 return true; 7806 } 7807 } 7808 } 7809 7810 MachineInstr *SIInstrInfo::createPHIDestinationCopy( 7811 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 7812 const DebugLoc &DL, Register Src, Register Dst) const { 7813 auto Cur = MBB.begin(); 7814 if (Cur != MBB.end()) 7815 do { 7816 if (!Cur->isPHI() && Cur->readsRegister(Dst)) 7817 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 7818 ++Cur; 7819 } while (Cur != MBB.end() && Cur != LastPHIIt); 7820 7821 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 7822 Dst); 7823 } 7824 7825 MachineInstr *SIInstrInfo::createPHISourceCopy( 7826 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 7827 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 7828 if (InsPt != MBB.end() && 7829 (InsPt->getOpcode() == AMDGPU::SI_IF || 7830 InsPt->getOpcode() == AMDGPU::SI_ELSE || 7831 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 7832 InsPt->definesRegister(Src)) { 7833 InsPt++; 7834 return BuildMI(MBB, InsPt, DL, 7835 get(ST.isWave32() ? 
AMDGPU::S_MOV_B32_term 7836 : AMDGPU::S_MOV_B64_term), 7837 Dst) 7838 .addReg(Src, 0, SrcSubReg) 7839 .addReg(AMDGPU::EXEC, RegState::Implicit); 7840 } 7841 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 7842 Dst); 7843 } 7844 7845 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 7846 7847 MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 7848 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 7849 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 7850 VirtRegMap *VRM) const { 7851 // This is a bit of a hack (copied from AArch64). Consider this instruction: 7852 // 7853 // %0:sreg_32 = COPY $m0 7854 // 7855 // We explicitly chose SReg_32 for the virtual register so such a copy might 7856 // be eliminated by RegisterCoalescer. However, that may not be possible, and 7857 // %0 may even spill. We can't spill $m0 normally (it would require copying to 7858 // a numbered SGPR anyway), and since it is in the SReg_32 register class, 7859 // TargetInstrInfo::foldMemoryOperand() is going to try. 7860 // A similar issue also exists with spilling and reloading $exec registers. 7861 // 7862 // To prevent that, constrain the %0 register class here. 7863 if (MI.isFullCopy()) { 7864 Register DstReg = MI.getOperand(0).getReg(); 7865 Register SrcReg = MI.getOperand(1).getReg(); 7866 if ((DstReg.isVirtual() || SrcReg.isVirtual()) && 7867 (DstReg.isVirtual() != SrcReg.isVirtual())) { 7868 MachineRegisterInfo &MRI = MF.getRegInfo(); 7869 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; 7870 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); 7871 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { 7872 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 7873 return nullptr; 7874 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { 7875 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); 7876 return nullptr; 7877 } 7878 } 7879 } 7880 7881 return nullptr; 7882 } 7883 7884 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 7885 const MachineInstr &MI, 7886 unsigned *PredCost) const { 7887 if (MI.isBundle()) { 7888 MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 7889 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 7890 unsigned Lat = 0, Count = 0; 7891 for (++I; I != E && I->isBundledWithPred(); ++I) { 7892 ++Count; 7893 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 7894 } 7895 return Lat + Count - 1; 7896 } 7897 7898 return SchedModel.computeInstrLatency(&MI); 7899 } 7900 7901 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) { 7902 switch (MF.getFunction().getCallingConv()) { 7903 case CallingConv::AMDGPU_PS: 7904 return 1; 7905 case CallingConv::AMDGPU_VS: 7906 return 2; 7907 case CallingConv::AMDGPU_GS: 7908 return 3; 7909 case CallingConv::AMDGPU_HS: 7910 case CallingConv::AMDGPU_LS: 7911 case CallingConv::AMDGPU_ES: 7912 report_fatal_error("ds_ordered_count unsupported for this calling conv"); 7913 case CallingConv::AMDGPU_CS: 7914 case CallingConv::AMDGPU_KERNEL: 7915 case CallingConv::C: 7916 case CallingConv::Fast: 7917 default: 7918 // Assume other calling conventions are various compute callable functions 7919 return 0; 7920 } 7921 } 7922
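// A typical use of the FLAT offset helpers above, sketched for illustration
// only (TII, COffsetVal, AddrSpace and FlatVariant are assumed to come from
// the caller):
//
//   int64_t ImmOffset, RemainderOffset;
//   std::tie(ImmOffset, RemainderOffset) =
//       TII->splitFlatOffset(COffsetVal, AddrSpace, FlatVariant);
//   // ImmOffset goes into the instruction's offset field; RemainderOffset
//   // must be folded into the address register separately.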