//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//

#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"

namespace llvm {
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}

// Must be at least 4 to be able to branch over the minimum unconditional
// branch code. This is only for making it possible to write reasonably small
// tests for long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);

SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  SchedModel.init(&ST);
}

//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//

static unsigned getNumOperandsNoGlue(SDNode *Node) {
  unsigned N = Node->getNumOperands();
  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
    --N;
  return N;
}

/// Returns true if both nodes have the same value for the given
/// operand \p Op, or if both nodes do not have this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
  unsigned Opc0 = N0->getMachineOpcode();
  unsigned Opc1 = N1->getMachineOpcode();

  int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
  int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);

  if (Op0Idx == -1 && Op1Idx == -1)
    return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;

  // getNamedOperandIdx returns the index for the MachineInstr's operands,
  // which includes the result as the first operand. We are indexing into the
  // MachineSDNode's operands, so we need to skip the result operand to get
  // the real index.
  --Op0Idx;
  --Op1Idx;

  return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}

bool SIInstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
    // Normally a VALU use of exec would block rematerialization, but an
    // implicit exec read is fine here since every VALU has one. We want all
    // of the generic logic for this check except for that detail.

    // Another potential implicit use is the mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere in
    // the function; otherwise it is safe, since mode is not changed.

    // Unlike the generic method, which does not allow rematerialization when
    // there are virtual register uses, we do allow it; this is why SOP
    // instructions are handled here as well.
    return !MI.hasImplicitDef() &&
           MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
           !MI.mayRaiseFPException();
  }

  return false;
}

// Returns true if the scalar result of a VALU instruction depends on exec.
static bool resultDependsOnExec(const MachineInstr &MI) {
  // Ignore comparisons which are only used masked with exec.
  // This allows some hoisting/sinking of VALU comparisons.
  if (MI.isCompare()) {
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    Register DstReg = MI.getOperand(0).getReg();
    if (!DstReg.isVirtual())
      return true;
    for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
      switch (Use.getOpcode()) {
      case AMDGPU::S_AND_SAVEEXEC_B32:
      case AMDGPU::S_AND_SAVEEXEC_B64:
        break;
      case AMDGPU::S_AND_B32:
      case AMDGPU::S_AND_B64:
        if (!Use.readsRegister(AMDGPU::EXEC))
          return true;
        break;
      default:
        return true;
      }
    }
    return false;
  }

  switch (MI.getOpcode()) {
  default:
    break;
  case AMDGPU::V_READFIRSTLANE_B32:
    return true;
  }

  return false;
}

bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
  // Any implicit use of exec by VALU is not a real register read.
  return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
         isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}

bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // st64 versions).
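    // For example, a ds_read2_b32 with offset0 == N and offset1 == N + 1 reads
    // two adjacent dwords and could in principle be treated as a single 8-byte
    // access, but that case is not handled here.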
198 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 199 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 200 if (Offset0Idx == -1 || Offset1Idx == -1) 201 return false; 202 203 // XXX - be careful of dataless loads 204 // getNamedOperandIdx returns the index for MachineInstrs. Since they 205 // include the output in the operand list, but SDNodes don't, we need to 206 // subtract the index by one. 207 Offset0Idx -= get(Opc0).NumDefs; 208 Offset1Idx -= get(Opc1).NumDefs; 209 Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); 210 Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); 211 return true; 212 } 213 214 if (isSMRD(Opc0) && isSMRD(Opc1)) { 215 // Skip time and cache invalidation instructions. 216 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) || 217 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase)) 218 return false; 219 220 unsigned NumOps = getNumOperandsNoGlue(Load0); 221 if (NumOps != getNumOperandsNoGlue(Load1)) 222 return false; 223 224 // Check base reg. 225 if (Load0->getOperand(0) != Load1->getOperand(0)) 226 return false; 227 228 // Match register offsets, if both register and immediate offsets present. 229 assert(NumOps == 4 || NumOps == 5); 230 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1)) 231 return false; 232 233 const ConstantSDNode *Load0Offset = 234 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3)); 235 const ConstantSDNode *Load1Offset = 236 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3)); 237 238 if (!Load0Offset || !Load1Offset) 239 return false; 240 241 Offset0 = Load0Offset->getZExtValue(); 242 Offset1 = Load1Offset->getZExtValue(); 243 return true; 244 } 245 246 // MUBUF and MTBUF can access the same addresses. 247 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { 248 249 // MUBUF and MTBUF have vaddr at different indices. 250 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || 251 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || 252 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) 253 return false; 254 255 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); 256 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); 257 258 if (OffIdx0 == -1 || OffIdx1 == -1) 259 return false; 260 261 // getNamedOperandIdx returns the index for MachineInstrs. Since they 262 // include the output in the operand list, but SDNodes don't, we need to 263 // subtract the index by one. 264 OffIdx0 -= get(Opc0).NumDefs; 265 OffIdx1 -= get(Opc1).NumDefs; 266 267 SDValue Off0 = Load0->getOperand(OffIdx0); 268 SDValue Off1 = Load1->getOperand(OffIdx1); 269 270 // The offset might be a FrameIndexSDNode. 
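    // (e.g. a stack access whose final offset is only known once frame
    // lowering has run), in which case no constant offset can be extracted.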
271 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) 272 return false; 273 274 Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); 275 Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); 276 return true; 277 } 278 279 return false; 280 } 281 282 static bool isStride64(unsigned Opc) { 283 switch (Opc) { 284 case AMDGPU::DS_READ2ST64_B32: 285 case AMDGPU::DS_READ2ST64_B64: 286 case AMDGPU::DS_WRITE2ST64_B32: 287 case AMDGPU::DS_WRITE2ST64_B64: 288 return true; 289 default: 290 return false; 291 } 292 } 293 294 bool SIInstrInfo::getMemOperandsWithOffsetWidth( 295 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 296 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 297 const TargetRegisterInfo *TRI) const { 298 if (!LdSt.mayLoadOrStore()) 299 return false; 300 301 unsigned Opc = LdSt.getOpcode(); 302 OffsetIsScalable = false; 303 const MachineOperand *BaseOp, *OffsetOp; 304 int DataOpIdx; 305 306 if (isDS(LdSt)) { 307 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); 308 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); 309 if (OffsetOp) { 310 // Normal, single offset LDS instruction. 311 if (!BaseOp) { 312 // DS_CONSUME/DS_APPEND use M0 for the base address. 313 // TODO: find the implicit use operand for M0 and use that as BaseOp? 314 return false; 315 } 316 BaseOps.push_back(BaseOp); 317 Offset = OffsetOp->getImm(); 318 // Get appropriate operand, and compute width accordingly. 319 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 320 if (DataOpIdx == -1) 321 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 322 Width = getOpSize(LdSt, DataOpIdx); 323 } else { 324 // The 2 offset instructions use offset0 and offset1 instead. We can treat 325 // these as a load with a single offset if the 2 offsets are consecutive. 326 // We will use this for some partially aligned loads. 327 const MachineOperand *Offset0Op = 328 getNamedOperand(LdSt, AMDGPU::OpName::offset0); 329 const MachineOperand *Offset1Op = 330 getNamedOperand(LdSt, AMDGPU::OpName::offset1); 331 332 unsigned Offset0 = Offset0Op->getImm() & 0xff; 333 unsigned Offset1 = Offset1Op->getImm() & 0xff; 334 if (Offset0 + 1 != Offset1) 335 return false; 336 337 // Each of these offsets is in element sized units, so we need to convert 338 // to bytes of the individual reads. 339 340 unsigned EltSize; 341 if (LdSt.mayLoad()) 342 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; 343 else { 344 assert(LdSt.mayStore()); 345 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 346 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; 347 } 348 349 if (isStride64(Opc)) 350 EltSize *= 64; 351 352 BaseOps.push_back(BaseOp); 353 Offset = EltSize * Offset0; 354 // Get appropriate operand(s), and compute width accordingly. 355 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 356 if (DataOpIdx == -1) { 357 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); 358 Width = getOpSize(LdSt, DataOpIdx); 359 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); 360 Width += getOpSize(LdSt, DataOpIdx); 361 } else { 362 Width = getOpSize(LdSt, DataOpIdx); 363 } 364 } 365 return true; 366 } 367 368 if (isMUBUF(LdSt) || isMTBUF(LdSt)) { 369 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); 370 if (!RSrc) // e.g. 
BUFFER_WBINVL1_VOL 371 return false; 372 BaseOps.push_back(RSrc); 373 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 374 if (BaseOp && !BaseOp->isFI()) 375 BaseOps.push_back(BaseOp); 376 const MachineOperand *OffsetImm = 377 getNamedOperand(LdSt, AMDGPU::OpName::offset); 378 Offset = OffsetImm->getImm(); 379 const MachineOperand *SOffset = 380 getNamedOperand(LdSt, AMDGPU::OpName::soffset); 381 if (SOffset) { 382 if (SOffset->isReg()) 383 BaseOps.push_back(SOffset); 384 else 385 Offset += SOffset->getImm(); 386 } 387 // Get appropriate operand, and compute width accordingly. 388 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 389 if (DataOpIdx == -1) 390 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); 391 if (DataOpIdx == -1) // LDS DMA 392 return false; 393 Width = getOpSize(LdSt, DataOpIdx); 394 return true; 395 } 396 397 if (isMIMG(LdSt)) { 398 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 399 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); 400 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 401 if (VAddr0Idx >= 0) { 402 // GFX10 possible NSA encoding. 403 for (int I = VAddr0Idx; I < SRsrcIdx; ++I) 404 BaseOps.push_back(&LdSt.getOperand(I)); 405 } else { 406 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); 407 } 408 Offset = 0; 409 // Get appropriate operand, and compute width accordingly. 410 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); 411 Width = getOpSize(LdSt, DataOpIdx); 412 return true; 413 } 414 415 if (isSMRD(LdSt)) { 416 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); 417 if (!BaseOp) // e.g. S_MEMTIME 418 return false; 419 BaseOps.push_back(BaseOp); 420 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); 421 Offset = OffsetOp ? OffsetOp->getImm() : 0; 422 // Get appropriate operand, and compute width accordingly. 423 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); 424 Width = getOpSize(LdSt, DataOpIdx); 425 return true; 426 } 427 428 if (isFLAT(LdSt)) { 429 // Instructions have either vaddr or saddr or both or none. 430 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); 431 if (BaseOp) 432 BaseOps.push_back(BaseOp); 433 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); 434 if (BaseOp) 435 BaseOps.push_back(BaseOp); 436 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); 437 // Get appropriate operand, and compute width accordingly. 438 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 439 if (DataOpIdx == -1) 440 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); 441 if (DataOpIdx == -1) // LDS DMA 442 return false; 443 Width = getOpSize(LdSt, DataOpIdx); 444 return true; 445 } 446 447 return false; 448 } 449 450 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, 451 ArrayRef<const MachineOperand *> BaseOps1, 452 const MachineInstr &MI2, 453 ArrayRef<const MachineOperand *> BaseOps2) { 454 // Only examine the first "base" operand of each instruction, on the 455 // assumption that it represents the real base address of the memory access. 456 // Other operands are typically offsets or indices from this base address. 
  if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
    return true;

  if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
    return false;

  auto MO1 = *MI1.memoperands_begin();
  auto MO2 = *MI2.memoperands_begin();
  if (MO1->getAddrSpace() != MO2->getAddrSpace())
    return false;

  auto Base1 = MO1->getValue();
  auto Base2 = MO2->getValue();
  if (!Base1 || !Base2)
    return false;
  Base1 = getUnderlyingObject(Base1);
  Base2 = getUnderlyingObject(Base2);

  if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
    return false;

  return Base1 == Base2;
}

bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered.
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr.
    return false;
  }

  // To avoid excessive register pressure, the number of DWORDs loaded together
  // by all clustered mem ops should not, on average, exceed 8. This is an
  // empirical value based on certain observations and performance related
  // experiments.
  // The advantage of this heuristic is that it avoids clustering too many
  // sub-word loads as well as clustering wide loads. Below is a brief summary
  // of how the heuristic behaves for various `LoadSize`:
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / NumLoads;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
  return NumDWORDs <= 8;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into 2 batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to deal with saying different
// address space loads should be clustered, and ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
                                          int64_t Offset0, int64_t Offset1,
                                          unsigned NumLoads) const {
  assert(Offset1 > Offset0 &&
         "Second offset should be larger than first offset!");
  // If we have fewer than 16 loads in a row, and the offsets are within 64
  // bytes, then schedule them together.

  // A cacheline is 64 bytes (for global memory).
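  // For example, four loads at offsets 0, 16, 32 and 48 all land in the same
  // 64-byte window and are scheduled near each other, whereas a pair at
  // offsets 0 and 64 is not.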
533 return (NumLoads <= 16 && (Offset1 - Offset0) < 64); 534 } 535 536 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, 537 MachineBasicBlock::iterator MI, 538 const DebugLoc &DL, MCRegister DestReg, 539 MCRegister SrcReg, bool KillSrc, 540 const char *Msg = "illegal VGPR to SGPR copy") { 541 MachineFunction *MF = MBB.getParent(); 542 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); 543 LLVMContext &C = MF->getFunction().getContext(); 544 C.diagnose(IllegalCopy); 545 546 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) 547 .addReg(SrcReg, getKillRegState(KillSrc)); 548 } 549 550 /// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not 551 /// possible to have a direct copy in these cases on GFX908, so an intermediate 552 /// VGPR copy is required. 553 static void indirectCopyToAGPR(const SIInstrInfo &TII, 554 MachineBasicBlock &MBB, 555 MachineBasicBlock::iterator MI, 556 const DebugLoc &DL, MCRegister DestReg, 557 MCRegister SrcReg, bool KillSrc, 558 RegScavenger &RS, bool RegsOverlap, 559 Register ImpDefSuperReg = Register(), 560 Register ImpUseSuperReg = Register()) { 561 assert((TII.getSubtarget().hasMAIInsts() && 562 !TII.getSubtarget().hasGFX90AInsts()) && 563 "Expected GFX908 subtarget."); 564 565 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) || 566 AMDGPU::AGPR_32RegClass.contains(SrcReg)) && 567 "Source register of the copy should be either an SGPR or an AGPR."); 568 569 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) && 570 "Destination register of the copy should be an AGPR."); 571 572 const SIRegisterInfo &RI = TII.getRegisterInfo(); 573 574 // First try to find defining accvgpr_write to avoid temporary registers. 575 // In the case of copies of overlapping AGPRs, we conservatively do not 576 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up 577 // an accvgpr_write used for this same copy due to implicit-defs 578 if (!RegsOverlap) { 579 for (auto Def = MI, E = MBB.begin(); Def != E; ) { 580 --Def; 581 582 if (!Def->modifiesRegister(SrcReg, &RI)) 583 continue; 584 585 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 || 586 Def->getOperand(0).getReg() != SrcReg) 587 break; 588 589 MachineOperand &DefOp = Def->getOperand(1); 590 assert(DefOp.isReg() || DefOp.isImm()); 591 592 if (DefOp.isReg()) { 593 bool SafeToPropagate = true; 594 // Check that register source operand is not clobbered before MI. 595 // Immediate operands are always safe to propagate. 596 for (auto I = Def; I != MI && SafeToPropagate; ++I) 597 if (I->modifiesRegister(DefOp.getReg(), &RI)) 598 SafeToPropagate = false; 599 600 if (!SafeToPropagate) 601 break; 602 603 DefOp.setIsKill(false); 604 } 605 606 MachineInstrBuilder Builder = 607 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 608 .add(DefOp); 609 if (ImpDefSuperReg) 610 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); 611 612 if (ImpUseSuperReg) { 613 Builder.addReg(ImpUseSuperReg, 614 getKillRegState(KillSrc) | RegState::Implicit); 615 } 616 617 return; 618 } 619 } 620 621 RS.enterBasicBlockEnd(MBB); 622 RS.backward(MI); 623 624 // Ideally we want to have three registers for a long reg_sequence copy 625 // to hide 2 waitstates between v_mov_b32 and accvgpr_write. 
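  // The destination AGPR number modulo 3 (computed below) rotates through the
  // candidate temporaries so that back-to-back accvgpr_writes in a long copy
  // can use different VGPRs.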
626 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, 627 *MBB.getParent()); 628 629 // Registers in the sequence are allocated contiguously so we can just 630 // use register number to pick one of three round-robin temps. 631 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3; 632 Register Tmp = 633 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy(); 634 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && 635 "VGPR used for an intermediate copy should have been reserved."); 636 637 // Only loop through if there are any free registers left. We don't want to 638 // spill. 639 while (RegNo--) { 640 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, 641 /* RestoreAfter */ false, 0, 642 /* AllowSpill */ false); 643 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) 644 break; 645 Tmp = Tmp2; 646 RS.setRegUsed(Tmp); 647 } 648 649 // Insert copy to temporary VGPR. 650 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; 651 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { 652 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; 653 } else { 654 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 655 } 656 657 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) 658 .addReg(SrcReg, getKillRegState(KillSrc)); 659 if (ImpUseSuperReg) { 660 UseBuilder.addReg(ImpUseSuperReg, 661 getKillRegState(KillSrc) | RegState::Implicit); 662 } 663 664 MachineInstrBuilder DefBuilder 665 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 666 .addReg(Tmp, RegState::Kill); 667 668 if (ImpDefSuperReg) 669 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); 670 } 671 672 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, 673 MachineBasicBlock::iterator MI, const DebugLoc &DL, 674 MCRegister DestReg, MCRegister SrcReg, bool KillSrc, 675 const TargetRegisterClass *RC, bool Forward) { 676 const SIRegisterInfo &RI = TII.getRegisterInfo(); 677 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); 678 MachineBasicBlock::iterator I = MI; 679 MachineInstr *FirstMI = nullptr, *LastMI = nullptr; 680 681 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { 682 int16_t SubIdx = BaseIndices[Idx]; 683 Register Reg = RI.getSubReg(DestReg, SubIdx); 684 unsigned Opcode = AMDGPU::S_MOV_B32; 685 686 // Is SGPR aligned? If so try to combine with next. 
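    // For example, when both the source and destination sub-registers start on
    // an even-numbered SGPR, two consecutive 32-bit moves are merged into a
    // single S_MOV_B64.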
687 Register Src = RI.getSubReg(SrcReg, SubIdx); 688 bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; 689 bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; 690 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { 691 // Can use SGPR64 copy 692 unsigned Channel = RI.getChannelFromSubReg(SubIdx); 693 SubIdx = RI.getSubRegFromChannel(Channel, 2); 694 Opcode = AMDGPU::S_MOV_B64; 695 Idx++; 696 } 697 698 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) 699 .addReg(RI.getSubReg(SrcReg, SubIdx)) 700 .addReg(SrcReg, RegState::Implicit); 701 702 if (!FirstMI) 703 FirstMI = LastMI; 704 705 if (!Forward) 706 I--; 707 } 708 709 assert(FirstMI && LastMI); 710 if (!Forward) 711 std::swap(FirstMI, LastMI); 712 713 FirstMI->addOperand( 714 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); 715 716 if (KillSrc) 717 LastMI->addRegisterKilled(SrcReg, &RI); 718 } 719 720 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 721 MachineBasicBlock::iterator MI, 722 const DebugLoc &DL, MCRegister DestReg, 723 MCRegister SrcReg, bool KillSrc) const { 724 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); 725 726 // FIXME: This is hack to resolve copies between 16 bit and 32 bit 727 // registers until all patterns are fixed. 728 if (Fix16BitCopies && 729 ((RI.getRegSizeInBits(*RC) == 16) ^ 730 (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) { 731 MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; 732 MCRegister Super = RI.get32BitRegister(RegToFix); 733 assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); 734 RegToFix = Super; 735 736 if (DestReg == SrcReg) { 737 // Insert empty bundle since ExpandPostRA expects an instruction here. 738 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); 739 return; 740 } 741 742 RC = RI.getPhysRegBaseClass(DestReg); 743 } 744 745 if (RC == &AMDGPU::VGPR_32RegClass) { 746 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || 747 AMDGPU::SReg_32RegClass.contains(SrcReg) || 748 AMDGPU::AGPR_32RegClass.contains(SrcReg)); 749 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? 750 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; 751 BuildMI(MBB, MI, DL, get(Opc), DestReg) 752 .addReg(SrcReg, getKillRegState(KillSrc)); 753 return; 754 } 755 756 if (RC == &AMDGPU::SReg_32_XM0RegClass || 757 RC == &AMDGPU::SReg_32RegClass) { 758 if (SrcReg == AMDGPU::SCC) { 759 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) 760 .addImm(1) 761 .addImm(0); 762 return; 763 } 764 765 if (DestReg == AMDGPU::VCC_LO) { 766 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { 767 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) 768 .addReg(SrcReg, getKillRegState(KillSrc)); 769 } else { 770 // FIXME: Hack until VReg_1 removed. 
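        // The i1 value lives in a VGPR holding 0 or 1 in each lane, so
        // comparing it against 0 rebuilds the corresponding lane mask.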
771 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 772 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 773 .addImm(0) 774 .addReg(SrcReg, getKillRegState(KillSrc)); 775 } 776 777 return; 778 } 779 780 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { 781 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 782 return; 783 } 784 785 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 786 .addReg(SrcReg, getKillRegState(KillSrc)); 787 return; 788 } 789 790 if (RC == &AMDGPU::SReg_64RegClass) { 791 if (SrcReg == AMDGPU::SCC) { 792 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) 793 .addImm(1) 794 .addImm(0); 795 return; 796 } 797 798 if (DestReg == AMDGPU::VCC) { 799 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 800 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) 801 .addReg(SrcReg, getKillRegState(KillSrc)); 802 } else { 803 // FIXME: Hack until VReg_1 removed. 804 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); 805 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) 806 .addImm(0) 807 .addReg(SrcReg, getKillRegState(KillSrc)); 808 } 809 810 return; 811 } 812 813 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { 814 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 815 return; 816 } 817 818 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 819 .addReg(SrcReg, getKillRegState(KillSrc)); 820 return; 821 } 822 823 if (DestReg == AMDGPU::SCC) { 824 // Copying 64-bit or 32-bit sources to SCC barely makes sense, 825 // but SelectionDAG emits such copies for i1 sources. 826 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { 827 // This copy can only be produced by patterns 828 // with explicit SCC, which are known to be enabled 829 // only for subtargets with S_CMP_LG_U64 present. 830 assert(ST.hasScalarCompareEq64()); 831 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) 832 .addReg(SrcReg, getKillRegState(KillSrc)) 833 .addImm(0); 834 } else { 835 assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); 836 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) 837 .addReg(SrcReg, getKillRegState(KillSrc)) 838 .addImm(0); 839 } 840 841 return; 842 } 843 844 if (RC == &AMDGPU::AGPR_32RegClass) { 845 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) || 846 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) { 847 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) 848 .addReg(SrcReg, getKillRegState(KillSrc)); 849 return; 850 } 851 852 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { 853 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) 854 .addReg(SrcReg, getKillRegState(KillSrc)); 855 return; 856 } 857 858 // FIXME: Pass should maintain scavenger to avoid scan through the block on 859 // every AGPR spill. 
860 RegScavenger RS; 861 const bool Overlap = RI.regsOverlap(SrcReg, DestReg); 862 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap); 863 return; 864 } 865 866 const unsigned Size = RI.getRegSizeInBits(*RC); 867 if (Size == 16) { 868 assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || 869 AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || 870 AMDGPU::SReg_LO16RegClass.contains(SrcReg) || 871 AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); 872 873 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); 874 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); 875 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); 876 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); 877 bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || 878 AMDGPU::SReg_LO16RegClass.contains(DestReg) || 879 AMDGPU::AGPR_LO16RegClass.contains(DestReg); 880 bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || 881 AMDGPU::SReg_LO16RegClass.contains(SrcReg) || 882 AMDGPU::AGPR_LO16RegClass.contains(SrcReg); 883 MCRegister NewDestReg = RI.get32BitRegister(DestReg); 884 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); 885 886 if (IsSGPRDst) { 887 if (!IsSGPRSrc) { 888 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 889 return; 890 } 891 892 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) 893 .addReg(NewSrcReg, getKillRegState(KillSrc)); 894 return; 895 } 896 897 if (IsAGPRDst || IsAGPRSrc) { 898 if (!DstLow || !SrcLow) { 899 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 900 "Cannot use hi16 subreg with an AGPR!"); 901 } 902 903 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); 904 return; 905 } 906 907 if (IsSGPRSrc && !ST.hasSDWAScalar()) { 908 if (!DstLow || !SrcLow) { 909 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, 910 "Cannot use hi16 subreg on VI!"); 911 } 912 913 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) 914 .addReg(NewSrcReg, getKillRegState(KillSrc)); 915 return; 916 } 917 918 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) 919 .addImm(0) // src0_modifiers 920 .addReg(NewSrcReg) 921 .addImm(0) // clamp 922 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 923 : AMDGPU::SDWA::SdwaSel::WORD_1) 924 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) 925 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 926 : AMDGPU::SDWA::SdwaSel::WORD_1) 927 .addReg(NewDestReg, RegState::Implicit | RegState::Undef); 928 // First implicit operand is $exec. 
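    // Tie the def to the implicit undef use of NewDestReg added above: with
    // dst_unused set to UNUSED_PRESERVE the untouched 16-bit half is kept, so
    // the instruction effectively also reads its destination.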
929 MIB->tieOperands(0, MIB->getNumOperands() - 1); 930 return; 931 } 932 933 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); 934 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { 935 if (ST.hasMovB64()) { 936 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) 937 .addReg(SrcReg, getKillRegState(KillSrc)); 938 return; 939 } 940 if (ST.hasPackedFP32Ops()) { 941 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) 942 .addImm(SISrcMods::OP_SEL_1) 943 .addReg(SrcReg) 944 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 945 .addReg(SrcReg) 946 .addImm(0) // op_sel_lo 947 .addImm(0) // op_sel_hi 948 .addImm(0) // neg_lo 949 .addImm(0) // neg_hi 950 .addImm(0) // clamp 951 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); 952 return; 953 } 954 } 955 956 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); 957 if (RI.isSGPRClass(RC)) { 958 if (!RI.isSGPRClass(SrcRC)) { 959 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); 960 return; 961 } 962 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); 963 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC, 964 Forward); 965 return; 966 } 967 968 unsigned EltSize = 4; 969 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 970 if (RI.isAGPRClass(RC)) { 971 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) 972 Opcode = AMDGPU::V_ACCVGPR_MOV_B32; 973 else if (RI.hasVGPRs(SrcRC) || 974 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC))) 975 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; 976 else 977 Opcode = AMDGPU::INSTRUCTION_LIST_END; 978 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { 979 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; 980 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && 981 (RI.isProperlyAlignedRC(*RC) && 982 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { 983 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. 984 if (ST.hasMovB64()) { 985 Opcode = AMDGPU::V_MOV_B64_e32; 986 EltSize = 8; 987 } else if (ST.hasPackedFP32Ops()) { 988 Opcode = AMDGPU::V_PK_MOV_B32; 989 EltSize = 8; 990 } 991 } 992 993 // For the cases where we need an intermediate instruction/temporary register 994 // (destination is an AGPR), we need a scavenger. 995 // 996 // FIXME: The pass should maintain this for us so we don't have to re-scan the 997 // whole block for every handled copy. 998 std::unique_ptr<RegScavenger> RS; 999 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) 1000 RS.reset(new RegScavenger()); 1001 1002 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); 1003 1004 // If there is an overlap, we can't kill the super-register on the last 1005 // instruction, since it will also kill the components made live by this def. 1006 const bool Overlap = RI.regsOverlap(SrcReg, DestReg); 1007 const bool CanKillSuperReg = KillSrc && !Overlap; 1008 1009 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 1010 unsigned SubIdx; 1011 if (Forward) 1012 SubIdx = SubIndices[Idx]; 1013 else 1014 SubIdx = SubIndices[SubIndices.size() - Idx - 1]; 1015 1016 bool IsFirstSubreg = Idx == 0; 1017 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; 1018 1019 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { 1020 Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); 1021 Register ImpUseSuper = SrcReg; 1022 indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), 1023 RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap, 1024 ImpDefSuper, ImpUseSuper); 1025 } else if (Opcode == AMDGPU::V_PK_MOV_B32) { 1026 Register DstSubReg = RI.getSubReg(DestReg, SubIdx); 1027 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); 1028 MachineInstrBuilder MIB = 1029 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) 1030 .addImm(SISrcMods::OP_SEL_1) 1031 .addReg(SrcSubReg) 1032 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) 1033 .addReg(SrcSubReg) 1034 .addImm(0) // op_sel_lo 1035 .addImm(0) // op_sel_hi 1036 .addImm(0) // neg_lo 1037 .addImm(0) // neg_hi 1038 .addImm(0) // clamp 1039 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 1040 if (IsFirstSubreg) 1041 MIB.addReg(DestReg, RegState::Define | RegState::Implicit); 1042 } else { 1043 MachineInstrBuilder Builder = 1044 BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) 1045 .addReg(RI.getSubReg(SrcReg, SubIdx)); 1046 if (IsFirstSubreg) 1047 Builder.addReg(DestReg, RegState::Define | RegState::Implicit); 1048 1049 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); 1050 } 1051 } 1052 } 1053 1054 int SIInstrInfo::commuteOpcode(unsigned Opcode) const { 1055 int NewOpc; 1056 1057 // Try to map original to commuted opcode 1058 NewOpc = AMDGPU::getCommuteRev(Opcode); 1059 if (NewOpc != -1) 1060 // Check if the commuted (REV) opcode exists on the target. 1061 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 1062 1063 // Try to map commuted to original opcode 1064 NewOpc = AMDGPU::getCommuteOrig(Opcode); 1065 if (NewOpc != -1) 1066 // Check if the original (non-REV) opcode exists on the target. 1067 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; 1068 1069 return Opcode; 1070 } 1071 1072 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, 1073 MachineBasicBlock::iterator MI, 1074 const DebugLoc &DL, Register DestReg, 1075 int64_t Value) const { 1076 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1077 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); 1078 if (RegClass == &AMDGPU::SReg_32RegClass || 1079 RegClass == &AMDGPU::SGPR_32RegClass || 1080 RegClass == &AMDGPU::SReg_32_XM0RegClass || 1081 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { 1082 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) 1083 .addImm(Value); 1084 return; 1085 } 1086 1087 if (RegClass == &AMDGPU::SReg_64RegClass || 1088 RegClass == &AMDGPU::SGPR_64RegClass || 1089 RegClass == &AMDGPU::SReg_64_XEXECRegClass) { 1090 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) 1091 .addImm(Value); 1092 return; 1093 } 1094 1095 if (RegClass == &AMDGPU::VGPR_32RegClass) { 1096 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) 1097 .addImm(Value); 1098 return; 1099 } 1100 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { 1101 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) 1102 .addImm(Value); 1103 return; 1104 } 1105 1106 unsigned EltSize = 4; 1107 unsigned Opcode = AMDGPU::V_MOV_B32_e32; 1108 if (RI.isSGPRClass(RegClass)) { 1109 if (RI.getRegSizeInBits(*RegClass) > 32) { 1110 Opcode = AMDGPU::S_MOV_B64; 1111 EltSize = 8; 1112 } else { 1113 Opcode = AMDGPU::S_MOV_B32; 1114 EltSize = 4; 1115 } 1116 } 1117 1118 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); 1119 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { 1120 int64_t IdxValue = Idx == 0 ? 
Value : 0; 1121 1122 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, 1123 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); 1124 Builder.addImm(IdxValue); 1125 } 1126 } 1127 1128 const TargetRegisterClass * 1129 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { 1130 return &AMDGPU::VGPR_32RegClass; 1131 } 1132 1133 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, 1134 MachineBasicBlock::iterator I, 1135 const DebugLoc &DL, Register DstReg, 1136 ArrayRef<MachineOperand> Cond, 1137 Register TrueReg, 1138 Register FalseReg) const { 1139 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 1140 const TargetRegisterClass *BoolXExecRC = 1141 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1142 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && 1143 "Not a VGPR32 reg"); 1144 1145 if (Cond.size() == 1) { 1146 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1147 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1148 .add(Cond[0]); 1149 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1150 .addImm(0) 1151 .addReg(FalseReg) 1152 .addImm(0) 1153 .addReg(TrueReg) 1154 .addReg(SReg); 1155 } else if (Cond.size() == 2) { 1156 assert(Cond[0].isImm() && "Cond[0] is not an immediate"); 1157 switch (Cond[0].getImm()) { 1158 case SIInstrInfo::SCC_TRUE: { 1159 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1160 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1161 : AMDGPU::S_CSELECT_B64), SReg) 1162 .addImm(1) 1163 .addImm(0); 1164 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1165 .addImm(0) 1166 .addReg(FalseReg) 1167 .addImm(0) 1168 .addReg(TrueReg) 1169 .addReg(SReg); 1170 break; 1171 } 1172 case SIInstrInfo::SCC_FALSE: { 1173 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1174 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1175 : AMDGPU::S_CSELECT_B64), SReg) 1176 .addImm(0) 1177 .addImm(1); 1178 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1179 .addImm(0) 1180 .addReg(FalseReg) 1181 .addImm(0) 1182 .addReg(TrueReg) 1183 .addReg(SReg); 1184 break; 1185 } 1186 case SIInstrInfo::VCCNZ: { 1187 MachineOperand RegOp = Cond[1]; 1188 RegOp.setImplicit(false); 1189 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1190 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1191 .add(RegOp); 1192 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1193 .addImm(0) 1194 .addReg(FalseReg) 1195 .addImm(0) 1196 .addReg(TrueReg) 1197 .addReg(SReg); 1198 break; 1199 } 1200 case SIInstrInfo::VCCZ: { 1201 MachineOperand RegOp = Cond[1]; 1202 RegOp.setImplicit(false); 1203 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1204 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) 1205 .add(RegOp); 1206 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1207 .addImm(0) 1208 .addReg(TrueReg) 1209 .addImm(0) 1210 .addReg(FalseReg) 1211 .addReg(SReg); 1212 break; 1213 } 1214 case SIInstrInfo::EXECNZ: { 1215 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1216 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 1217 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 1218 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 1219 .addImm(0); 1220 BuildMI(MBB, I, DL, get(ST.isWave32() ? 
AMDGPU::S_CSELECT_B32 1221 : AMDGPU::S_CSELECT_B64), SReg) 1222 .addImm(1) 1223 .addImm(0); 1224 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1225 .addImm(0) 1226 .addReg(FalseReg) 1227 .addImm(0) 1228 .addReg(TrueReg) 1229 .addReg(SReg); 1230 break; 1231 } 1232 case SIInstrInfo::EXECZ: { 1233 Register SReg = MRI.createVirtualRegister(BoolXExecRC); 1234 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); 1235 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 1236 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) 1237 .addImm(0); 1238 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 1239 : AMDGPU::S_CSELECT_B64), SReg) 1240 .addImm(0) 1241 .addImm(1); 1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) 1243 .addImm(0) 1244 .addReg(FalseReg) 1245 .addImm(0) 1246 .addReg(TrueReg) 1247 .addReg(SReg); 1248 llvm_unreachable("Unhandled branch predicate EXECZ"); 1249 break; 1250 } 1251 default: 1252 llvm_unreachable("invalid branch predicate"); 1253 } 1254 } else { 1255 llvm_unreachable("Can only handle Cond size 1 or 2"); 1256 } 1257 } 1258 1259 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, 1260 MachineBasicBlock::iterator I, 1261 const DebugLoc &DL, 1262 Register SrcReg, int Value) const { 1263 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1264 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1265 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) 1266 .addImm(Value) 1267 .addReg(SrcReg); 1268 1269 return Reg; 1270 } 1271 1272 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, 1273 MachineBasicBlock::iterator I, 1274 const DebugLoc &DL, 1275 Register SrcReg, int Value) const { 1276 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 1277 Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); 1278 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) 1279 .addImm(Value) 1280 .addReg(SrcReg); 1281 1282 return Reg; 1283 } 1284 1285 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { 1286 1287 if (RI.isAGPRClass(DstRC)) 1288 return AMDGPU::COPY; 1289 if (RI.getRegSizeInBits(*DstRC) == 32) { 1290 return RI.isSGPRClass(DstRC) ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; 1291 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { 1292 return AMDGPU::S_MOV_B64; 1293 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { 1294 return AMDGPU::V_MOV_B64_PSEUDO; 1295 } 1296 return AMDGPU::COPY; 1297 } 1298 1299 const MCInstrDesc & 1300 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, 1301 bool IsIndirectSrc) const { 1302 if (IsIndirectSrc) { 1303 if (VecSize <= 32) // 4 bytes 1304 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); 1305 if (VecSize <= 64) // 8 bytes 1306 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); 1307 if (VecSize <= 96) // 12 bytes 1308 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); 1309 if (VecSize <= 128) // 16 bytes 1310 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); 1311 if (VecSize <= 160) // 20 bytes 1312 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); 1313 if (VecSize <= 256) // 32 bytes 1314 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); 1315 if (VecSize <= 288) // 36 bytes 1316 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9); 1317 if (VecSize <= 320) // 40 bytes 1318 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10); 1319 if (VecSize <= 352) // 44 bytes 1320 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11); 1321 if (VecSize <= 384) // 48 bytes 1322 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12); 1323 if (VecSize <= 512) // 64 bytes 1324 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); 1325 if (VecSize <= 1024) // 128 bytes 1326 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); 1327 1328 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); 1329 } 1330 1331 if (VecSize <= 32) // 4 bytes 1332 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); 1333 if (VecSize <= 64) // 8 bytes 1334 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); 1335 if (VecSize <= 96) // 12 bytes 1336 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); 1337 if (VecSize <= 128) // 16 bytes 1338 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); 1339 if (VecSize <= 160) // 20 bytes 1340 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); 1341 if (VecSize <= 256) // 32 bytes 1342 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); 1343 if (VecSize <= 288) // 36 bytes 1344 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9); 1345 if (VecSize <= 320) // 40 bytes 1346 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10); 1347 if (VecSize <= 352) // 44 bytes 1348 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11); 1349 if (VecSize <= 384) // 48 bytes 1350 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12); 1351 if (VecSize <= 512) // 64 bytes 1352 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); 1353 if (VecSize <= 1024) // 128 bytes 1354 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); 1355 1356 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); 1357 } 1358 1359 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { 1360 if (VecSize <= 32) // 4 bytes 1361 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; 1362 if (VecSize <= 64) // 8 bytes 1363 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1364 if (VecSize <= 96) // 12 bytes 1365 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; 1366 if (VecSize <= 128) // 16 bytes 1367 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1368 if (VecSize <= 160) // 20 bytes 1369 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; 
1370 if (VecSize <= 256) // 32 bytes 1371 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; 1372 if (VecSize <= 288) // 36 bytes 1373 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9; 1374 if (VecSize <= 320) // 40 bytes 1375 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10; 1376 if (VecSize <= 352) // 44 bytes 1377 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11; 1378 if (VecSize <= 384) // 48 bytes 1379 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12; 1380 if (VecSize <= 512) // 64 bytes 1381 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; 1382 if (VecSize <= 1024) // 128 bytes 1383 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; 1384 1385 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1386 } 1387 1388 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { 1389 if (VecSize <= 32) // 4 bytes 1390 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; 1391 if (VecSize <= 64) // 8 bytes 1392 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; 1393 if (VecSize <= 96) // 12 bytes 1394 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; 1395 if (VecSize <= 128) // 16 bytes 1396 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; 1397 if (VecSize <= 160) // 20 bytes 1398 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; 1399 if (VecSize <= 256) // 32 bytes 1400 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; 1401 if (VecSize <= 288) // 36 bytes 1402 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9; 1403 if (VecSize <= 320) // 40 bytes 1404 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10; 1405 if (VecSize <= 352) // 44 bytes 1406 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11; 1407 if (VecSize <= 384) // 48 bytes 1408 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12; 1409 if (VecSize <= 512) // 64 bytes 1410 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; 1411 if (VecSize <= 1024) // 128 bytes 1412 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; 1413 1414 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1415 } 1416 1417 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { 1418 if (VecSize <= 64) // 8 bytes 1419 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; 1420 if (VecSize <= 128) // 16 bytes 1421 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; 1422 if (VecSize <= 256) // 32 bytes 1423 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; 1424 if (VecSize <= 512) // 64 bytes 1425 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; 1426 if (VecSize <= 1024) // 128 bytes 1427 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; 1428 1429 llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); 1430 } 1431 1432 const MCInstrDesc & 1433 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, 1434 bool IsSGPR) const { 1435 if (IsSGPR) { 1436 switch (EltSize) { 1437 case 32: 1438 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); 1439 case 64: 1440 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); 1441 default: 1442 llvm_unreachable("invalid reg indexing elt size"); 1443 } 1444 } 1445 1446 assert(EltSize == 32 && "invalid reg indexing elt size"); 1447 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); 1448 } 1449 1450 static unsigned getSGPRSpillSaveOpcode(unsigned Size) { 1451 switch (Size) { 1452 case 4: 1453 return AMDGPU::SI_SPILL_S32_SAVE; 1454 case 8: 1455 return AMDGPU::SI_SPILL_S64_SAVE; 1456 case 12: 1457 return AMDGPU::SI_SPILL_S96_SAVE; 1458 case 16: 1459 return AMDGPU::SI_SPILL_S128_SAVE; 1460 case 20: 1461 return 
AMDGPU::SI_SPILL_S160_SAVE; 1462 case 24: 1463 return AMDGPU::SI_SPILL_S192_SAVE; 1464 case 28: 1465 return AMDGPU::SI_SPILL_S224_SAVE; 1466 case 32: 1467 return AMDGPU::SI_SPILL_S256_SAVE; 1468 case 36: 1469 return AMDGPU::SI_SPILL_S288_SAVE; 1470 case 40: 1471 return AMDGPU::SI_SPILL_S320_SAVE; 1472 case 44: 1473 return AMDGPU::SI_SPILL_S352_SAVE; 1474 case 48: 1475 return AMDGPU::SI_SPILL_S384_SAVE; 1476 case 64: 1477 return AMDGPU::SI_SPILL_S512_SAVE; 1478 case 128: 1479 return AMDGPU::SI_SPILL_S1024_SAVE; 1480 default: 1481 llvm_unreachable("unknown register size"); 1482 } 1483 } 1484 1485 static unsigned getVGPRSpillSaveOpcode(unsigned Size) { 1486 switch (Size) { 1487 case 4: 1488 return AMDGPU::SI_SPILL_V32_SAVE; 1489 case 8: 1490 return AMDGPU::SI_SPILL_V64_SAVE; 1491 case 12: 1492 return AMDGPU::SI_SPILL_V96_SAVE; 1493 case 16: 1494 return AMDGPU::SI_SPILL_V128_SAVE; 1495 case 20: 1496 return AMDGPU::SI_SPILL_V160_SAVE; 1497 case 24: 1498 return AMDGPU::SI_SPILL_V192_SAVE; 1499 case 28: 1500 return AMDGPU::SI_SPILL_V224_SAVE; 1501 case 32: 1502 return AMDGPU::SI_SPILL_V256_SAVE; 1503 case 36: 1504 return AMDGPU::SI_SPILL_V288_SAVE; 1505 case 40: 1506 return AMDGPU::SI_SPILL_V320_SAVE; 1507 case 44: 1508 return AMDGPU::SI_SPILL_V352_SAVE; 1509 case 48: 1510 return AMDGPU::SI_SPILL_V384_SAVE; 1511 case 64: 1512 return AMDGPU::SI_SPILL_V512_SAVE; 1513 case 128: 1514 return AMDGPU::SI_SPILL_V1024_SAVE; 1515 default: 1516 llvm_unreachable("unknown register size"); 1517 } 1518 } 1519 1520 static unsigned getAGPRSpillSaveOpcode(unsigned Size) { 1521 switch (Size) { 1522 case 4: 1523 return AMDGPU::SI_SPILL_A32_SAVE; 1524 case 8: 1525 return AMDGPU::SI_SPILL_A64_SAVE; 1526 case 12: 1527 return AMDGPU::SI_SPILL_A96_SAVE; 1528 case 16: 1529 return AMDGPU::SI_SPILL_A128_SAVE; 1530 case 20: 1531 return AMDGPU::SI_SPILL_A160_SAVE; 1532 case 24: 1533 return AMDGPU::SI_SPILL_A192_SAVE; 1534 case 28: 1535 return AMDGPU::SI_SPILL_A224_SAVE; 1536 case 32: 1537 return AMDGPU::SI_SPILL_A256_SAVE; 1538 case 36: 1539 return AMDGPU::SI_SPILL_A288_SAVE; 1540 case 40: 1541 return AMDGPU::SI_SPILL_A320_SAVE; 1542 case 44: 1543 return AMDGPU::SI_SPILL_A352_SAVE; 1544 case 48: 1545 return AMDGPU::SI_SPILL_A384_SAVE; 1546 case 64: 1547 return AMDGPU::SI_SPILL_A512_SAVE; 1548 case 128: 1549 return AMDGPU::SI_SPILL_A1024_SAVE; 1550 default: 1551 llvm_unreachable("unknown register size"); 1552 } 1553 } 1554 1555 static unsigned getAVSpillSaveOpcode(unsigned Size) { 1556 switch (Size) { 1557 case 4: 1558 return AMDGPU::SI_SPILL_AV32_SAVE; 1559 case 8: 1560 return AMDGPU::SI_SPILL_AV64_SAVE; 1561 case 12: 1562 return AMDGPU::SI_SPILL_AV96_SAVE; 1563 case 16: 1564 return AMDGPU::SI_SPILL_AV128_SAVE; 1565 case 20: 1566 return AMDGPU::SI_SPILL_AV160_SAVE; 1567 case 24: 1568 return AMDGPU::SI_SPILL_AV192_SAVE; 1569 case 28: 1570 return AMDGPU::SI_SPILL_AV224_SAVE; 1571 case 32: 1572 return AMDGPU::SI_SPILL_AV256_SAVE; 1573 case 36: 1574 return AMDGPU::SI_SPILL_AV288_SAVE; 1575 case 40: 1576 return AMDGPU::SI_SPILL_AV320_SAVE; 1577 case 44: 1578 return AMDGPU::SI_SPILL_AV352_SAVE; 1579 case 48: 1580 return AMDGPU::SI_SPILL_AV384_SAVE; 1581 case 64: 1582 return AMDGPU::SI_SPILL_AV512_SAVE; 1583 case 128: 1584 return AMDGPU::SI_SPILL_AV1024_SAVE; 1585 default: 1586 llvm_unreachable("unknown register size"); 1587 } 1588 } 1589 1590 static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { 1591 // Currently, there is only 32-bit WWM register spills needed. 
  if (Size != 4)
    llvm_unreachable("unknown wwm register spill size");

  return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}

static unsigned getVectorRegSpillSaveOpcode(Register Reg,
                                            const TargetRegisterClass *RC,
                                            unsigned Size,
                                            const SIRegisterInfo &TRI,
                                            const SIMachineFunctionInfo &MFI) {
  // Choose the right opcode if spilling a WWM register.
  if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
    return getWWMRegSpillSaveOpcode(Size);

  if (TRI.isVectorSuperClass(RC))
    return getAVSpillSaveOpcode(Size);

  return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
                             : getVGPRSpillSaveOpcode(Size);
}

void SIInstrInfo::storeRegToStackSlot(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
    const TargetRegisterInfo *TRI, Register VReg) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ?
VReg : SrcReg, RC, 1659 SpillSize, RI, *MFI); 1660 MFI->setHasSpilledVGPRs(); 1661 1662 BuildMI(MBB, MI, DL, get(Opcode)) 1663 .addReg(SrcReg, getKillRegState(isKill)) // data 1664 .addFrameIndex(FrameIndex) // addr 1665 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset 1666 .addImm(0) // offset 1667 .addMemOperand(MMO); 1668 } 1669 1670 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { 1671 switch (Size) { 1672 case 4: 1673 return AMDGPU::SI_SPILL_S32_RESTORE; 1674 case 8: 1675 return AMDGPU::SI_SPILL_S64_RESTORE; 1676 case 12: 1677 return AMDGPU::SI_SPILL_S96_RESTORE; 1678 case 16: 1679 return AMDGPU::SI_SPILL_S128_RESTORE; 1680 case 20: 1681 return AMDGPU::SI_SPILL_S160_RESTORE; 1682 case 24: 1683 return AMDGPU::SI_SPILL_S192_RESTORE; 1684 case 28: 1685 return AMDGPU::SI_SPILL_S224_RESTORE; 1686 case 32: 1687 return AMDGPU::SI_SPILL_S256_RESTORE; 1688 case 36: 1689 return AMDGPU::SI_SPILL_S288_RESTORE; 1690 case 40: 1691 return AMDGPU::SI_SPILL_S320_RESTORE; 1692 case 44: 1693 return AMDGPU::SI_SPILL_S352_RESTORE; 1694 case 48: 1695 return AMDGPU::SI_SPILL_S384_RESTORE; 1696 case 64: 1697 return AMDGPU::SI_SPILL_S512_RESTORE; 1698 case 128: 1699 return AMDGPU::SI_SPILL_S1024_RESTORE; 1700 default: 1701 llvm_unreachable("unknown register size"); 1702 } 1703 } 1704 1705 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { 1706 switch (Size) { 1707 case 4: 1708 return AMDGPU::SI_SPILL_V32_RESTORE; 1709 case 8: 1710 return AMDGPU::SI_SPILL_V64_RESTORE; 1711 case 12: 1712 return AMDGPU::SI_SPILL_V96_RESTORE; 1713 case 16: 1714 return AMDGPU::SI_SPILL_V128_RESTORE; 1715 case 20: 1716 return AMDGPU::SI_SPILL_V160_RESTORE; 1717 case 24: 1718 return AMDGPU::SI_SPILL_V192_RESTORE; 1719 case 28: 1720 return AMDGPU::SI_SPILL_V224_RESTORE; 1721 case 32: 1722 return AMDGPU::SI_SPILL_V256_RESTORE; 1723 case 36: 1724 return AMDGPU::SI_SPILL_V288_RESTORE; 1725 case 40: 1726 return AMDGPU::SI_SPILL_V320_RESTORE; 1727 case 44: 1728 return AMDGPU::SI_SPILL_V352_RESTORE; 1729 case 48: 1730 return AMDGPU::SI_SPILL_V384_RESTORE; 1731 case 64: 1732 return AMDGPU::SI_SPILL_V512_RESTORE; 1733 case 128: 1734 return AMDGPU::SI_SPILL_V1024_RESTORE; 1735 default: 1736 llvm_unreachable("unknown register size"); 1737 } 1738 } 1739 1740 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { 1741 switch (Size) { 1742 case 4: 1743 return AMDGPU::SI_SPILL_A32_RESTORE; 1744 case 8: 1745 return AMDGPU::SI_SPILL_A64_RESTORE; 1746 case 12: 1747 return AMDGPU::SI_SPILL_A96_RESTORE; 1748 case 16: 1749 return AMDGPU::SI_SPILL_A128_RESTORE; 1750 case 20: 1751 return AMDGPU::SI_SPILL_A160_RESTORE; 1752 case 24: 1753 return AMDGPU::SI_SPILL_A192_RESTORE; 1754 case 28: 1755 return AMDGPU::SI_SPILL_A224_RESTORE; 1756 case 32: 1757 return AMDGPU::SI_SPILL_A256_RESTORE; 1758 case 36: 1759 return AMDGPU::SI_SPILL_A288_RESTORE; 1760 case 40: 1761 return AMDGPU::SI_SPILL_A320_RESTORE; 1762 case 44: 1763 return AMDGPU::SI_SPILL_A352_RESTORE; 1764 case 48: 1765 return AMDGPU::SI_SPILL_A384_RESTORE; 1766 case 64: 1767 return AMDGPU::SI_SPILL_A512_RESTORE; 1768 case 128: 1769 return AMDGPU::SI_SPILL_A1024_RESTORE; 1770 default: 1771 llvm_unreachable("unknown register size"); 1772 } 1773 } 1774 1775 static unsigned getAVSpillRestoreOpcode(unsigned Size) { 1776 switch (Size) { 1777 case 4: 1778 return AMDGPU::SI_SPILL_AV32_RESTORE; 1779 case 8: 1780 return AMDGPU::SI_SPILL_AV64_RESTORE; 1781 case 12: 1782 return AMDGPU::SI_SPILL_AV96_RESTORE; 1783 case 16: 1784 return AMDGPU::SI_SPILL_AV128_RESTORE; 1785 case 20: 1786 
return AMDGPU::SI_SPILL_AV160_RESTORE; 1787 case 24: 1788 return AMDGPU::SI_SPILL_AV192_RESTORE; 1789 case 28: 1790 return AMDGPU::SI_SPILL_AV224_RESTORE; 1791 case 32: 1792 return AMDGPU::SI_SPILL_AV256_RESTORE; 1793 case 36: 1794 return AMDGPU::SI_SPILL_AV288_RESTORE; 1795 case 40: 1796 return AMDGPU::SI_SPILL_AV320_RESTORE; 1797 case 44: 1798 return AMDGPU::SI_SPILL_AV352_RESTORE; 1799 case 48: 1800 return AMDGPU::SI_SPILL_AV384_RESTORE; 1801 case 64: 1802 return AMDGPU::SI_SPILL_AV512_RESTORE; 1803 case 128: 1804 return AMDGPU::SI_SPILL_AV1024_RESTORE; 1805 default: 1806 llvm_unreachable("unknown register size"); 1807 } 1808 } 1809 1810 static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { 1811 // Currently, there is only 32-bit WWM register spills needed. 1812 if (Size != 4) 1813 llvm_unreachable("unknown wwm register spill size"); 1814 1815 return AMDGPU::SI_SPILL_WWM_V32_RESTORE; 1816 } 1817 1818 static unsigned 1819 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, 1820 unsigned Size, const SIRegisterInfo &TRI, 1821 const SIMachineFunctionInfo &MFI) { 1822 // Choose the right opcode if restoring a WWM register. 1823 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) 1824 return getWWMRegSpillRestoreOpcode(Size); 1825 1826 if (TRI.isVectorSuperClass(RC)) 1827 return getAVSpillRestoreOpcode(Size); 1828 1829 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) 1830 : getVGPRSpillRestoreOpcode(Size); 1831 } 1832 1833 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 1834 MachineBasicBlock::iterator MI, 1835 Register DestReg, int FrameIndex, 1836 const TargetRegisterClass *RC, 1837 const TargetRegisterInfo *TRI, 1838 Register VReg) const { 1839 MachineFunction *MF = MBB.getParent(); 1840 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 1841 MachineFrameInfo &FrameInfo = MF->getFrameInfo(); 1842 const DebugLoc &DL = MBB.findDebugLoc(MI); 1843 unsigned SpillSize = TRI->getSpillSize(*RC); 1844 1845 MachinePointerInfo PtrInfo 1846 = MachinePointerInfo::getFixedStack(*MF, FrameIndex); 1847 1848 MachineMemOperand *MMO = MF->getMachineMemOperand( 1849 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), 1850 FrameInfo.getObjectAlign(FrameIndex)); 1851 1852 if (RI.isSGPRClass(RC)) { 1853 MFI->setHasSpilledSGPRs(); 1854 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); 1855 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && 1856 DestReg != AMDGPU::EXEC && "exec should not be spilled"); 1857 1858 // FIXME: Maybe this should not include a memoperand because it will be 1859 // lowered to non-memory instructions. 1860 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); 1861 if (DestReg.isVirtual() && SpillSize == 4) { 1862 MachineRegisterInfo &MRI = MF->getRegInfo(); 1863 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 1864 } 1865 1866 if (RI.spillSGPRToVGPR()) 1867 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); 1868 BuildMI(MBB, MI, DL, OpDesc, DestReg) 1869 .addFrameIndex(FrameIndex) // addr 1870 .addMemOperand(MMO) 1871 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); 1872 1873 return; 1874 } 1875 1876 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? 
                                                       VReg : DestReg, RC,
                                                       SpillSize, RI, *MFI);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex)           // vaddr
      .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
      .addImm(0)                           // offset
      .addMemOperand(MMO);
}

void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertNoops(MBB, MI, 1);
}

void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              unsigned Quantity) const {
  DebugLoc DL = MBB.findDebugLoc(MI);
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
  }
}

void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
  auto MF = MBB.getParent();
  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();

  assert(Info->isEntryFunction());

  if (MBB.succ_empty()) {
    bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
    if (HasNoTerminator) {
      if (Info->returnsVoid()) {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
      } else {
        BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
      }
    }
  }
}

unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    if (MI.isMetaInstruction())
      return 0;
    return 1; // FIXME: Do wait states equal cycles?

  case AMDGPU::S_NOP:
    return MI.getOperand(0).getImm() + 1;
  // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
  // hazard, even if one exists, won't really be visible. Should we handle it?
  }
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;
  case AMDGPU::S_OR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B64));
    break;
  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
1976 MI.setDesc(get(AMDGPU::S_ANDN2_B64)); 1977 break; 1978 1979 case AMDGPU::S_ANDN2_B32_term: 1980 // This is only a terminator to get the correct spill code placement during 1981 // register allocation. 1982 MI.setDesc(get(AMDGPU::S_ANDN2_B32)); 1983 break; 1984 1985 case AMDGPU::S_AND_B64_term: 1986 // This is only a terminator to get the correct spill code placement during 1987 // register allocation. 1988 MI.setDesc(get(AMDGPU::S_AND_B64)); 1989 break; 1990 1991 case AMDGPU::S_AND_B32_term: 1992 // This is only a terminator to get the correct spill code placement during 1993 // register allocation. 1994 MI.setDesc(get(AMDGPU::S_AND_B32)); 1995 break; 1996 1997 case AMDGPU::S_AND_SAVEEXEC_B64_term: 1998 // This is only a terminator to get the correct spill code placement during 1999 // register allocation. 2000 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64)); 2001 break; 2002 2003 case AMDGPU::S_AND_SAVEEXEC_B32_term: 2004 // This is only a terminator to get the correct spill code placement during 2005 // register allocation. 2006 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); 2007 break; 2008 2009 case AMDGPU::V_MOV_B64_PSEUDO: { 2010 Register Dst = MI.getOperand(0).getReg(); 2011 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 2012 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 2013 2014 const MachineOperand &SrcOp = MI.getOperand(1); 2015 // FIXME: Will this work for 64-bit floating point immediates? 2016 assert(!SrcOp.isFPImm()); 2017 if (ST.hasMovB64()) { 2018 MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); 2019 if (SrcOp.isReg() || isInlineConstant(MI, 1) || 2020 isUInt<32>(SrcOp.getImm())) 2021 break; 2022 } 2023 if (SrcOp.isImm()) { 2024 APInt Imm(64, SrcOp.getImm()); 2025 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 2026 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 2027 if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { 2028 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 2029 .addImm(SISrcMods::OP_SEL_1) 2030 .addImm(Lo.getSExtValue()) 2031 .addImm(SISrcMods::OP_SEL_1) 2032 .addImm(Lo.getSExtValue()) 2033 .addImm(0) // op_sel_lo 2034 .addImm(0) // op_sel_hi 2035 .addImm(0) // neg_lo 2036 .addImm(0) // neg_hi 2037 .addImm(0); // clamp 2038 } else { 2039 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 2040 .addImm(Lo.getSExtValue()) 2041 .addReg(Dst, RegState::Implicit | RegState::Define); 2042 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 2043 .addImm(Hi.getSExtValue()) 2044 .addReg(Dst, RegState::Implicit | RegState::Define); 2045 } 2046 } else { 2047 assert(SrcOp.isReg()); 2048 if (ST.hasPackedFP32Ops() && 2049 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { 2050 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) 2051 .addImm(SISrcMods::OP_SEL_1) // src0_mod 2052 .addReg(SrcOp.getReg()) 2053 .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod 2054 .addReg(SrcOp.getReg()) 2055 .addImm(0) // op_sel_lo 2056 .addImm(0) // op_sel_hi 2057 .addImm(0) // neg_lo 2058 .addImm(0) // neg_hi 2059 .addImm(0); // clamp 2060 } else { 2061 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) 2062 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) 2063 .addReg(Dst, RegState::Implicit | RegState::Define); 2064 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) 2065 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) 2066 .addReg(Dst, RegState::Implicit | RegState::Define); 2067 } 2068 } 2069 MI.eraseFromParent(); 2070 break; 2071 } 2072 case AMDGPU::V_MOV_B64_DPP_PSEUDO: { 2073 expandMovDPP64(MI); 2074 break; 2075 } 2076 
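  // The next case keeps a 64-bit scalar immediate move as a single S_MOV_B64
  // when the value fits in 32 bits or is an inline constant, and otherwise
  // splits it into two 32-bit moves of the halves, roughly:
  //   s_mov_b32 dst.sub0, lo32(imm)
  //   s_mov_b32 dst.sub1, hi32(imm)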
case AMDGPU::S_MOV_B64_IMM_PSEUDO: { 2077 const MachineOperand &SrcOp = MI.getOperand(1); 2078 assert(!SrcOp.isFPImm()); 2079 APInt Imm(64, SrcOp.getImm()); 2080 if (Imm.isIntN(32) || isInlineConstant(Imm)) { 2081 MI.setDesc(get(AMDGPU::S_MOV_B64)); 2082 break; 2083 } 2084 2085 Register Dst = MI.getOperand(0).getReg(); 2086 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); 2087 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); 2088 2089 APInt Lo(32, Imm.getLoBits(32).getZExtValue()); 2090 APInt Hi(32, Imm.getHiBits(32).getZExtValue()); 2091 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) 2092 .addImm(Lo.getSExtValue()) 2093 .addReg(Dst, RegState::Implicit | RegState::Define); 2094 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) 2095 .addImm(Hi.getSExtValue()) 2096 .addReg(Dst, RegState::Implicit | RegState::Define); 2097 MI.eraseFromParent(); 2098 break; 2099 } 2100 case AMDGPU::V_SET_INACTIVE_B32: { 2101 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 2102 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 2103 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM 2104 // optimizations (mainly Register Coalescer) aware of WWM register liveness. 2105 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 2106 .add(MI.getOperand(1)); 2107 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 2108 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 2109 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) 2110 .add(MI.getOperand(2)); 2111 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 2112 .addReg(Exec); 2113 MI.eraseFromParent(); 2114 break; 2115 } 2116 case AMDGPU::V_SET_INACTIVE_B64: { 2117 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; 2118 unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; 2119 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 2120 MI.getOperand(0).getReg()) 2121 .add(MI.getOperand(1)); 2122 expandPostRAPseudo(*Copy); 2123 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); 2124 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten 2125 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), 2126 MI.getOperand(0).getReg()) 2127 .add(MI.getOperand(2)); 2128 expandPostRAPseudo(*Copy); 2129 BuildMI(MBB, MI, DL, get(NotOpc), Exec) 2130 .addReg(Exec); 2131 MI.eraseFromParent(); 2132 break; 2133 } 2134 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: 2135 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: 2136 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: 2137 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: 2138 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: 2139 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: 2140 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9: 2141 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10: 2142 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11: 2143 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12: 2144 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: 2145 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: 2146 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: 2147 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: 2148 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: 2149 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: 2150 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: 2151 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: 2152 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9: 2153 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10: 2154 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11: 2155 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12: 2156 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: 2157 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: 2158 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: 2159 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: 2160 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: 2161 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: 2162 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { 2163 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); 2164 2165 unsigned Opc; 2166 if (RI.hasVGPRs(EltRC)) { 2167 Opc = AMDGPU::V_MOVRELD_B32_e32; 2168 } else { 2169 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 2170 : AMDGPU::S_MOVRELD_B32; 2171 } 2172 2173 const MCInstrDesc &OpDesc = get(Opc); 2174 Register VecReg = MI.getOperand(0).getReg(); 2175 bool IsUndef = MI.getOperand(1).isUndef(); 2176 unsigned SubReg = MI.getOperand(3).getImm(); 2177 assert(VecReg == MI.getOperand(1).getReg()); 2178 2179 MachineInstrBuilder MIB = 2180 BuildMI(MBB, MI, DL, OpDesc) 2181 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2182 .add(MI.getOperand(2)) 2183 .addReg(VecReg, RegState::ImplicitDefine) 2184 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); 2185 2186 const int ImpDefIdx = 2187 OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); 2188 const int ImpUseIdx = ImpDefIdx + 1; 2189 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 2190 MI.eraseFromParent(); 2191 break; 2192 } 2193 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: 2194 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: 2195 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: 2196 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: 2197 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: 2198 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: 2199 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9: 2200 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10: 2201 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11: 2202 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12: 2203 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: 2204 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { 2205 assert(ST.useVGPRIndexMode()); 2206 Register VecReg = MI.getOperand(0).getReg(); 2207 bool IsUndef = MI.getOperand(1).isUndef(); 2208 Register Idx = MI.getOperand(3).getReg(); 2209 Register SubReg = MI.getOperand(4).getImm(); 2210 2211 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2212 .addReg(Idx) 2213 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); 2214 SetOn->getOperand(3).setIsUndef(); 2215 2216 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); 2217 MachineInstrBuilder MIB = 2218 BuildMI(MBB, MI, DL, OpDesc) 2219 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2220 .add(MI.getOperand(2)) 2221 .addReg(VecReg, RegState::ImplicitDefine) 2222 .addReg(VecReg, 2223 RegState::Implicit | (IsUndef ? RegState::Undef : 0)); 2224 2225 const int ImpDefIdx = 2226 OpDesc.getNumOperands() + OpDesc.implicit_uses().size(); 2227 const int ImpUseIdx = ImpDefIdx + 1; 2228 MIB->tieOperands(ImpDefIdx, ImpUseIdx); 2229 2230 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); 2231 2232 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); 2233 2234 MI.eraseFromParent(); 2235 break; 2236 } 2237 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: 2238 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: 2239 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: 2240 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: 2241 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: 2242 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: 2243 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9: 2244 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10: 2245 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11: 2246 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12: 2247 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: 2248 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { 2249 assert(ST.useVGPRIndexMode()); 2250 Register Dst = MI.getOperand(0).getReg(); 2251 Register VecReg = MI.getOperand(1).getReg(); 2252 bool IsUndef = MI.getOperand(1).isUndef(); 2253 Register Idx = MI.getOperand(2).getReg(); 2254 Register SubReg = MI.getOperand(3).getImm(); 2255 2256 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) 2257 .addReg(Idx) 2258 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); 2259 SetOn->getOperand(3).setIsUndef(); 2260 2261 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) 2262 .addDef(Dst) 2263 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) 2264 .addReg(VecReg, RegState::Implicit | (IsUndef ? 
                    RegState::Undef : 0));

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_STRICT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // Whole Wave Mode is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::ENTER_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // STRICT_WQM is entered.
    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_STRICT_WWM:
  case AMDGPU::EXIT_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  case AMDGPU::ENTER_PSEUDO_WM:
  case AMDGPU::EXIT_PSEUDO_WM: {
    // These do nothing.
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_RETURN: {
    const MachineFunction *MF = MBB.getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    // Hiding the return address use with SI_RETURN may lead to extra kills in
    // the function and missing live-ins. We are fine in practice because callee
    // saved register handling ensures the register value is restored before
    // RET, but we need the undef flag here to appease the MachineVerifier
    // liveness checks.
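    // The expansion itself is just an indirect jump through the return
    // address register (e.g. s_setpc_b64 s[30:31] for the usual calling
    // convention), with the implicit operands of SI_RETURN carried over.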
2341 MachineInstrBuilder MIB = 2342 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return)) 2343 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef); 2344 2345 MIB.copyImplicitOps(MI); 2346 MI.eraseFromParent(); 2347 break; 2348 } 2349 } 2350 return true; 2351 } 2352 2353 std::pair<MachineInstr*, MachineInstr*> 2354 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { 2355 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); 2356 2357 if (ST.hasMovB64() && 2358 AMDGPU::isLegal64BitDPPControl( 2359 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { 2360 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); 2361 return std::pair(&MI, nullptr); 2362 } 2363 2364 MachineBasicBlock &MBB = *MI.getParent(); 2365 DebugLoc DL = MBB.findDebugLoc(MI); 2366 MachineFunction *MF = MBB.getParent(); 2367 MachineRegisterInfo &MRI = MF->getRegInfo(); 2368 Register Dst = MI.getOperand(0).getReg(); 2369 unsigned Part = 0; 2370 MachineInstr *Split[2]; 2371 2372 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { 2373 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); 2374 if (Dst.isPhysical()) { 2375 MovDPP.addDef(RI.getSubReg(Dst, Sub)); 2376 } else { 2377 assert(MRI.isSSA()); 2378 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 2379 MovDPP.addDef(Tmp); 2380 } 2381 2382 for (unsigned I = 1; I <= 2; ++I) { // old and src operands. 2383 const MachineOperand &SrcOp = MI.getOperand(I); 2384 assert(!SrcOp.isFPImm()); 2385 if (SrcOp.isImm()) { 2386 APInt Imm(64, SrcOp.getImm()); 2387 Imm.ashrInPlace(Part * 32); 2388 MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); 2389 } else { 2390 assert(SrcOp.isReg()); 2391 Register Src = SrcOp.getReg(); 2392 if (Src.isPhysical()) 2393 MovDPP.addReg(RI.getSubReg(Src, Sub)); 2394 else 2395 MovDPP.addReg(Src, SrcOp.isUndef() ? 
RegState::Undef : 0, Sub); 2396 } 2397 } 2398 2399 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3)) 2400 MovDPP.addImm(MO.getImm()); 2401 2402 Split[Part] = MovDPP; 2403 ++Part; 2404 } 2405 2406 if (Dst.isVirtual()) 2407 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) 2408 .addReg(Split[0]->getOperand(0).getReg()) 2409 .addImm(AMDGPU::sub0) 2410 .addReg(Split[1]->getOperand(0).getReg()) 2411 .addImm(AMDGPU::sub1); 2412 2413 MI.eraseFromParent(); 2414 return std::pair(Split[0], Split[1]); 2415 } 2416 2417 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, 2418 MachineOperand &Src0, 2419 unsigned Src0OpName, 2420 MachineOperand &Src1, 2421 unsigned Src1OpName) const { 2422 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); 2423 if (!Src0Mods) 2424 return false; 2425 2426 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); 2427 assert(Src1Mods && 2428 "All commutable instructions have both src0 and src1 modifiers"); 2429 2430 int Src0ModsVal = Src0Mods->getImm(); 2431 int Src1ModsVal = Src1Mods->getImm(); 2432 2433 Src1Mods->setImm(Src0ModsVal); 2434 Src0Mods->setImm(Src1ModsVal); 2435 return true; 2436 } 2437 2438 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, 2439 MachineOperand &RegOp, 2440 MachineOperand &NonRegOp) { 2441 Register Reg = RegOp.getReg(); 2442 unsigned SubReg = RegOp.getSubReg(); 2443 bool IsKill = RegOp.isKill(); 2444 bool IsDead = RegOp.isDead(); 2445 bool IsUndef = RegOp.isUndef(); 2446 bool IsDebug = RegOp.isDebug(); 2447 2448 if (NonRegOp.isImm()) 2449 RegOp.ChangeToImmediate(NonRegOp.getImm()); 2450 else if (NonRegOp.isFI()) 2451 RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); 2452 else if (NonRegOp.isGlobal()) { 2453 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), 2454 NonRegOp.getTargetFlags()); 2455 } else 2456 return nullptr; 2457 2458 // Make sure we don't reinterpret a subreg index in the target flags. 2459 RegOp.setTargetFlags(NonRegOp.getTargetFlags()); 2460 2461 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); 2462 NonRegOp.setSubReg(SubReg); 2463 2464 return &MI; 2465 } 2466 2467 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, 2468 unsigned Src0Idx, 2469 unsigned Src1Idx) const { 2470 assert(!NewMI && "this should never be used"); 2471 2472 unsigned Opc = MI.getOpcode(); 2473 int CommutedOpcode = commuteOpcode(Opc); 2474 if (CommutedOpcode == -1) 2475 return nullptr; 2476 2477 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == 2478 static_cast<int>(Src0Idx) && 2479 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == 2480 static_cast<int>(Src1Idx) && 2481 "inconsistency with findCommutedOpIndices"); 2482 2483 MachineOperand &Src0 = MI.getOperand(Src0Idx); 2484 MachineOperand &Src1 = MI.getOperand(Src1Idx); 2485 2486 MachineInstr *CommutedMI = nullptr; 2487 if (Src0.isReg() && Src1.isReg()) { 2488 if (isOperandLegal(MI, Src1Idx, &Src0)) { 2489 // Be sure to copy the source modifiers to the right place. 2490 CommutedMI 2491 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); 2492 } 2493 2494 } else if (Src0.isReg() && !Src1.isReg()) { 2495 // src0 should always be able to support any operand type, so no need to 2496 // check operand legality. 
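    // Swapping here moves the immediate (or FI/global) into src0 and the
    // register into src1; e.g. commuting v_sub_f32 v0, v1, 1.0 yields
    // v_subrev_f32 v0, 1.0, v1 once the opcode is updated below
    // (illustrative registers).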
2497 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); 2498 } else if (!Src0.isReg() && Src1.isReg()) { 2499 if (isOperandLegal(MI, Src1Idx, &Src0)) 2500 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); 2501 } else { 2502 // FIXME: Found two non registers to commute. This does happen. 2503 return nullptr; 2504 } 2505 2506 if (CommutedMI) { 2507 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, 2508 Src1, AMDGPU::OpName::src1_modifiers); 2509 2510 CommutedMI->setDesc(get(CommutedOpcode)); 2511 } 2512 2513 return CommutedMI; 2514 } 2515 2516 // This needs to be implemented because the source modifiers may be inserted 2517 // between the true commutable operands, and the base 2518 // TargetInstrInfo::commuteInstruction uses it. 2519 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, 2520 unsigned &SrcOpIdx0, 2521 unsigned &SrcOpIdx1) const { 2522 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); 2523 } 2524 2525 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, 2526 unsigned &SrcOpIdx0, 2527 unsigned &SrcOpIdx1) const { 2528 if (!Desc.isCommutable()) 2529 return false; 2530 2531 unsigned Opc = Desc.getOpcode(); 2532 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 2533 if (Src0Idx == -1) 2534 return false; 2535 2536 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 2537 if (Src1Idx == -1) 2538 return false; 2539 2540 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); 2541 } 2542 2543 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, 2544 int64_t BrOffset) const { 2545 // BranchRelaxation should never have to check s_setpc_b64 because its dest 2546 // block is unanalyzable. 2547 assert(BranchOp != AMDGPU::S_SETPC_B64); 2548 2549 // Convert to dwords. 2550 BrOffset /= 4; 2551 2552 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is 2553 // from the next instruction. 2554 BrOffset -= 1; 2555 2556 return isIntN(BranchOffsetBits, BrOffset); 2557 } 2558 2559 MachineBasicBlock *SIInstrInfo::getBranchDestBlock( 2560 const MachineInstr &MI) const { 2561 if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { 2562 // This would be a difficult analysis to perform, but can always be legal so 2563 // there's no need to analyze it. 
2564 return nullptr; 2565 } 2566 2567 return MI.getOperand(0).getMBB(); 2568 } 2569 2570 bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const { 2571 for (const MachineInstr &MI : MBB->terminators()) { 2572 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || 2573 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE || 2574 MI.getOpcode() == AMDGPU::SI_LOOP) 2575 return true; 2576 } 2577 return false; 2578 } 2579 2580 void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, 2581 MachineBasicBlock &DestBB, 2582 MachineBasicBlock &RestoreBB, 2583 const DebugLoc &DL, int64_t BrOffset, 2584 RegScavenger *RS) const { 2585 assert(RS && "RegScavenger required for long branching"); 2586 assert(MBB.empty() && 2587 "new block should be inserted for expanding unconditional branch"); 2588 assert(MBB.pred_size() == 1); 2589 assert(RestoreBB.empty() && 2590 "restore block should be inserted for restoring clobbered registers"); 2591 2592 MachineFunction *MF = MBB.getParent(); 2593 MachineRegisterInfo &MRI = MF->getRegInfo(); 2594 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 2595 2596 // FIXME: Virtual register workaround for RegScavenger not working with empty 2597 // blocks. 2598 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 2599 2600 auto I = MBB.end(); 2601 2602 // We need to compute the offset relative to the instruction immediately after 2603 // s_getpc_b64. Insert pc arithmetic code before last terminator. 2604 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); 2605 2606 auto &MCCtx = MF->getContext(); 2607 MCSymbol *PostGetPCLabel = 2608 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); 2609 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); 2610 2611 MCSymbol *OffsetLo = 2612 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); 2613 MCSymbol *OffsetHi = 2614 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); 2615 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) 2616 .addReg(PCReg, RegState::Define, AMDGPU::sub0) 2617 .addReg(PCReg, 0, AMDGPU::sub0) 2618 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); 2619 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) 2620 .addReg(PCReg, RegState::Define, AMDGPU::sub1) 2621 .addReg(PCReg, 0, AMDGPU::sub1) 2622 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); 2623 2624 // Insert the indirect branch after the other terminator. 2625 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) 2626 .addReg(PCReg); 2627 2628 // If a spill is needed for the pc register pair, we need to insert a spill 2629 // restore block right before the destination block, and insert a short branch 2630 // into the old destination block's fallthrough predecessor. 2631 // e.g.: 2632 // 2633 // s_cbranch_scc0 skip_long_branch: 2634 // 2635 // long_branch_bb: 2636 // spill s[8:9] 2637 // s_getpc_b64 s[8:9] 2638 // s_add_u32 s8, s8, restore_bb 2639 // s_addc_u32 s9, s9, 0 2640 // s_setpc_b64 s[8:9] 2641 // 2642 // skip_long_branch: 2643 // foo; 2644 // 2645 // ..... 
2646 // 2647 // dest_bb_fallthrough_predecessor: 2648 // bar; 2649 // s_branch dest_bb 2650 // 2651 // restore_bb: 2652 // restore s[8:9] 2653 // fallthrough dest_bb 2654 /// 2655 // dest_bb: 2656 // buzz; 2657 2658 Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); 2659 Register Scav; 2660 2661 // If we've previously reserved a register for long branches 2662 // avoid running the scavenger and just use those registers 2663 if (LongBranchReservedReg) { 2664 RS->enterBasicBlock(MBB); 2665 Scav = LongBranchReservedReg; 2666 } else { 2667 RS->enterBasicBlockEnd(MBB); 2668 Scav = RS->scavengeRegisterBackwards( 2669 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), 2670 /* RestoreAfter */ false, 0, /* AllowSpill */ false); 2671 } 2672 if (Scav) { 2673 RS->setRegUsed(Scav); 2674 MRI.replaceRegWith(PCReg, Scav); 2675 MRI.clearVirtRegs(); 2676 } else { 2677 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for 2678 // SGPR spill. 2679 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); 2680 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 2681 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); 2682 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); 2683 MRI.clearVirtRegs(); 2684 } 2685 2686 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); 2687 // Now, the distance could be defined. 2688 auto *Offset = MCBinaryExpr::createSub( 2689 MCSymbolRefExpr::create(DestLabel, MCCtx), 2690 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); 2691 // Add offset assignments. 2692 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); 2693 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); 2694 auto *ShAmt = MCConstantExpr::create(32, MCCtx); 2695 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); 2696 } 2697 2698 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { 2699 switch (Cond) { 2700 case SIInstrInfo::SCC_TRUE: 2701 return AMDGPU::S_CBRANCH_SCC1; 2702 case SIInstrInfo::SCC_FALSE: 2703 return AMDGPU::S_CBRANCH_SCC0; 2704 case SIInstrInfo::VCCNZ: 2705 return AMDGPU::S_CBRANCH_VCCNZ; 2706 case SIInstrInfo::VCCZ: 2707 return AMDGPU::S_CBRANCH_VCCZ; 2708 case SIInstrInfo::EXECNZ: 2709 return AMDGPU::S_CBRANCH_EXECNZ; 2710 case SIInstrInfo::EXECZ: 2711 return AMDGPU::S_CBRANCH_EXECZ; 2712 default: 2713 llvm_unreachable("invalid branch predicate"); 2714 } 2715 } 2716 2717 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { 2718 switch (Opcode) { 2719 case AMDGPU::S_CBRANCH_SCC0: 2720 return SCC_FALSE; 2721 case AMDGPU::S_CBRANCH_SCC1: 2722 return SCC_TRUE; 2723 case AMDGPU::S_CBRANCH_VCCNZ: 2724 return VCCNZ; 2725 case AMDGPU::S_CBRANCH_VCCZ: 2726 return VCCZ; 2727 case AMDGPU::S_CBRANCH_EXECNZ: 2728 return EXECNZ; 2729 case AMDGPU::S_CBRANCH_EXECZ: 2730 return EXECZ; 2731 default: 2732 return INVALID_BR; 2733 } 2734 } 2735 2736 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, 2737 MachineBasicBlock::iterator I, 2738 MachineBasicBlock *&TBB, 2739 MachineBasicBlock *&FBB, 2740 SmallVectorImpl<MachineOperand> &Cond, 2741 bool AllowModify) const { 2742 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2743 // Unconditional Branch 2744 TBB = I->getOperand(0).getMBB(); 2745 return false; 2746 } 2747 2748 MachineBasicBlock *CondBB = nullptr; 2749 2750 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 2751 CondBB = I->getOperand(1).getMBB(); 2752 Cond.push_back(I->getOperand(0)); 2753 } else { 2754 
BranchPredicate Pred = getBranchPredicate(I->getOpcode()); 2755 if (Pred == INVALID_BR) 2756 return true; 2757 2758 CondBB = I->getOperand(0).getMBB(); 2759 Cond.push_back(MachineOperand::CreateImm(Pred)); 2760 Cond.push_back(I->getOperand(1)); // Save the branch register. 2761 } 2762 ++I; 2763 2764 if (I == MBB.end()) { 2765 // Conditional branch followed by fall-through. 2766 TBB = CondBB; 2767 return false; 2768 } 2769 2770 if (I->getOpcode() == AMDGPU::S_BRANCH) { 2771 TBB = CondBB; 2772 FBB = I->getOperand(0).getMBB(); 2773 return false; 2774 } 2775 2776 return true; 2777 } 2778 2779 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, 2780 MachineBasicBlock *&FBB, 2781 SmallVectorImpl<MachineOperand> &Cond, 2782 bool AllowModify) const { 2783 MachineBasicBlock::iterator I = MBB.getFirstTerminator(); 2784 auto E = MBB.end(); 2785 if (I == E) 2786 return false; 2787 2788 // Skip over the instructions that are artificially terminators for special 2789 // exec management. 2790 while (I != E && !I->isBranch() && !I->isReturn()) { 2791 switch (I->getOpcode()) { 2792 case AMDGPU::S_MOV_B64_term: 2793 case AMDGPU::S_XOR_B64_term: 2794 case AMDGPU::S_OR_B64_term: 2795 case AMDGPU::S_ANDN2_B64_term: 2796 case AMDGPU::S_AND_B64_term: 2797 case AMDGPU::S_AND_SAVEEXEC_B64_term: 2798 case AMDGPU::S_MOV_B32_term: 2799 case AMDGPU::S_XOR_B32_term: 2800 case AMDGPU::S_OR_B32_term: 2801 case AMDGPU::S_ANDN2_B32_term: 2802 case AMDGPU::S_AND_B32_term: 2803 case AMDGPU::S_AND_SAVEEXEC_B32_term: 2804 break; 2805 case AMDGPU::SI_IF: 2806 case AMDGPU::SI_ELSE: 2807 case AMDGPU::SI_KILL_I1_TERMINATOR: 2808 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 2809 // FIXME: It's messy that these need to be considered here at all. 2810 return true; 2811 default: 2812 llvm_unreachable("unexpected non-branch terminator inst"); 2813 } 2814 2815 ++I; 2816 } 2817 2818 if (I == E) 2819 return false; 2820 2821 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); 2822 } 2823 2824 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, 2825 int *BytesRemoved) const { 2826 unsigned Count = 0; 2827 unsigned RemovedSize = 0; 2828 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) { 2829 // Skip over artificial terminators when removing instructions. 2830 if (MI.isBranch() || MI.isReturn()) { 2831 RemovedSize += getInstSizeInBytes(MI); 2832 MI.eraseFromParent(); 2833 ++Count; 2834 } 2835 } 2836 2837 if (BytesRemoved) 2838 *BytesRemoved = RemovedSize; 2839 2840 return Count; 2841 } 2842 2843 // Copy the flags onto the implicit condition register operand. 2844 static void preserveCondRegFlags(MachineOperand &CondReg, 2845 const MachineOperand &OrigCond) { 2846 CondReg.setIsUndef(OrigCond.isUndef()); 2847 CondReg.setIsKill(OrigCond.isKill()); 2848 } 2849 2850 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, 2851 MachineBasicBlock *TBB, 2852 MachineBasicBlock *FBB, 2853 ArrayRef<MachineOperand> Cond, 2854 const DebugLoc &DL, 2855 int *BytesAdded) const { 2856 if (!FBB && Cond.empty()) { 2857 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2858 .addMBB(TBB); 2859 if (BytesAdded) 2860 *BytesAdded = ST.hasOffset3fBug() ? 
8 : 4; 2861 return 1; 2862 } 2863 2864 if(Cond.size() == 1 && Cond[0].isReg()) { 2865 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) 2866 .add(Cond[0]) 2867 .addMBB(TBB); 2868 return 1; 2869 } 2870 2871 assert(TBB && Cond[0].isImm()); 2872 2873 unsigned Opcode 2874 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); 2875 2876 if (!FBB) { 2877 Cond[1].isUndef(); 2878 MachineInstr *CondBr = 2879 BuildMI(&MBB, DL, get(Opcode)) 2880 .addMBB(TBB); 2881 2882 // Copy the flags onto the implicit condition register operand. 2883 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); 2884 fixImplicitOperands(*CondBr); 2885 2886 if (BytesAdded) 2887 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; 2888 return 1; 2889 } 2890 2891 assert(TBB && FBB); 2892 2893 MachineInstr *CondBr = 2894 BuildMI(&MBB, DL, get(Opcode)) 2895 .addMBB(TBB); 2896 fixImplicitOperands(*CondBr); 2897 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) 2898 .addMBB(FBB); 2899 2900 MachineOperand &CondReg = CondBr->getOperand(1); 2901 CondReg.setIsUndef(Cond[1].isUndef()); 2902 CondReg.setIsKill(Cond[1].isKill()); 2903 2904 if (BytesAdded) 2905 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; 2906 2907 return 2; 2908 } 2909 2910 bool SIInstrInfo::reverseBranchCondition( 2911 SmallVectorImpl<MachineOperand> &Cond) const { 2912 if (Cond.size() != 2) { 2913 return true; 2914 } 2915 2916 if (Cond[0].isImm()) { 2917 Cond[0].setImm(-Cond[0].getImm()); 2918 return false; 2919 } 2920 2921 return true; 2922 } 2923 2924 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 2925 ArrayRef<MachineOperand> Cond, 2926 Register DstReg, Register TrueReg, 2927 Register FalseReg, int &CondCycles, 2928 int &TrueCycles, int &FalseCycles) const { 2929 switch (Cond[0].getImm()) { 2930 case VCCNZ: 2931 case VCCZ: { 2932 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2933 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2934 if (MRI.getRegClass(FalseReg) != RC) 2935 return false; 2936 2937 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; 2938 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 2939 2940 // Limit to equal cost for branch vs. N v_cndmask_b32s. 2941 return RI.hasVGPRs(RC) && NumInsts <= 6; 2942 } 2943 case SCC_TRUE: 2944 case SCC_FALSE: { 2945 // FIXME: We could insert for VGPRs if we could replace the original compare 2946 // with a vector one. 2947 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2948 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg); 2949 if (MRI.getRegClass(FalseReg) != RC) 2950 return false; 2951 2952 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; 2953 2954 // Multiples of 8 can do s_cselect_b64 2955 if (NumInsts % 2 == 0) 2956 NumInsts /= 2; 2957 2958 CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? 
2959 return RI.isSGPRClass(RC); 2960 } 2961 default: 2962 return false; 2963 } 2964 } 2965 2966 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, 2967 MachineBasicBlock::iterator I, const DebugLoc &DL, 2968 Register DstReg, ArrayRef<MachineOperand> Cond, 2969 Register TrueReg, Register FalseReg) const { 2970 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); 2971 if (Pred == VCCZ || Pred == SCC_FALSE) { 2972 Pred = static_cast<BranchPredicate>(-Pred); 2973 std::swap(TrueReg, FalseReg); 2974 } 2975 2976 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 2977 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); 2978 unsigned DstSize = RI.getRegSizeInBits(*DstRC); 2979 2980 if (DstSize == 32) { 2981 MachineInstr *Select; 2982 if (Pred == SCC_TRUE) { 2983 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) 2984 .addReg(TrueReg) 2985 .addReg(FalseReg); 2986 } else { 2987 // Instruction's operands are backwards from what is expected. 2988 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) 2989 .addReg(FalseReg) 2990 .addReg(TrueReg); 2991 } 2992 2993 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 2994 return; 2995 } 2996 2997 if (DstSize == 64 && Pred == SCC_TRUE) { 2998 MachineInstr *Select = 2999 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) 3000 .addReg(TrueReg) 3001 .addReg(FalseReg); 3002 3003 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 3004 return; 3005 } 3006 3007 static const int16_t Sub0_15[] = { 3008 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 3009 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 3010 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, 3011 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 3012 }; 3013 3014 static const int16_t Sub0_15_64[] = { 3015 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, 3016 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, 3017 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, 3018 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, 3019 }; 3020 3021 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32; 3022 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass; 3023 const int16_t *SubIndices = Sub0_15; 3024 int NElts = DstSize / 32; 3025 3026 // 64-bit select is only available for SALU. 3027 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit. 
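  // The loop below emits one select per 32-bit (or, on the SALU path, 64-bit)
  // element and then combines the results with a REG_SEQUENCE, e.g. for a
  // 64-bit VGPR destination (illustrative registers):
  //   v_cndmask_b32 e0, false.sub0, true.sub0
  //   v_cndmask_b32 e1, false.sub1, true.sub1
  //   dst = REG_SEQUENCE e0, sub0, e1, sub1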
3028 if (Pred == SCC_TRUE) { 3029 if (NElts % 2) { 3030 SelOp = AMDGPU::S_CSELECT_B32; 3031 EltRC = &AMDGPU::SGPR_32RegClass; 3032 } else { 3033 SelOp = AMDGPU::S_CSELECT_B64; 3034 EltRC = &AMDGPU::SGPR_64RegClass; 3035 SubIndices = Sub0_15_64; 3036 NElts /= 2; 3037 } 3038 } 3039 3040 MachineInstrBuilder MIB = BuildMI( 3041 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg); 3042 3043 I = MIB->getIterator(); 3044 3045 SmallVector<Register, 8> Regs; 3046 for (int Idx = 0; Idx != NElts; ++Idx) { 3047 Register DstElt = MRI.createVirtualRegister(EltRC); 3048 Regs.push_back(DstElt); 3049 3050 unsigned SubIdx = SubIndices[Idx]; 3051 3052 MachineInstr *Select; 3053 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { 3054 Select = 3055 BuildMI(MBB, I, DL, get(SelOp), DstElt) 3056 .addReg(FalseReg, 0, SubIdx) 3057 .addReg(TrueReg, 0, SubIdx); 3058 } else { 3059 Select = 3060 BuildMI(MBB, I, DL, get(SelOp), DstElt) 3061 .addReg(TrueReg, 0, SubIdx) 3062 .addReg(FalseReg, 0, SubIdx); 3063 } 3064 3065 preserveCondRegFlags(Select->getOperand(3), Cond[1]); 3066 fixImplicitOperands(*Select); 3067 3068 MIB.addReg(DstElt) 3069 .addImm(SubIdx); 3070 } 3071 } 3072 3073 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { 3074 switch (MI.getOpcode()) { 3075 case AMDGPU::V_MOV_B32_e32: 3076 case AMDGPU::V_MOV_B32_e64: 3077 case AMDGPU::V_MOV_B64_PSEUDO: 3078 case AMDGPU::V_MOV_B64_e32: 3079 case AMDGPU::V_MOV_B64_e64: 3080 case AMDGPU::S_MOV_B32: 3081 case AMDGPU::S_MOV_B64: 3082 case AMDGPU::COPY: 3083 case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 3084 case AMDGPU::V_ACCVGPR_READ_B32_e64: 3085 case AMDGPU::V_ACCVGPR_MOV_B32: 3086 return true; 3087 default: 3088 return false; 3089 } 3090 } 3091 3092 static constexpr unsigned ModifierOpNames[] = { 3093 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, 3094 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp, 3095 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel}; 3096 3097 void SIInstrInfo::removeModOperands(MachineInstr &MI) const { 3098 unsigned Opc = MI.getOpcode(); 3099 for (unsigned Name : reverse(ModifierOpNames)) { 3100 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name); 3101 if (Idx >= 0) 3102 MI.removeOperand(Idx); 3103 } 3104 } 3105 3106 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, 3107 Register Reg, MachineRegisterInfo *MRI) const { 3108 if (!MRI->hasOneNonDBGUse(Reg)) 3109 return false; 3110 3111 switch (DefMI.getOpcode()) { 3112 default: 3113 return false; 3114 case AMDGPU::S_MOV_B64: 3115 // TODO: We could fold 64-bit immediates, but this get complicated 3116 // when there are sub-registers. 3117 return false; 3118 3119 case AMDGPU::V_MOV_B32_e32: 3120 case AMDGPU::S_MOV_B32: 3121 case AMDGPU::V_ACCVGPR_WRITE_B32_e64: 3122 break; 3123 } 3124 3125 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); 3126 assert(ImmOp); 3127 // FIXME: We could handle FrameIndex values here. 3128 if (!ImmOp->isImm()) 3129 return false; 3130 3131 unsigned Opc = UseMI.getOpcode(); 3132 if (Opc == AMDGPU::COPY) { 3133 Register DstReg = UseMI.getOperand(0).getReg(); 3134 bool Is16Bit = getOpSize(UseMI, 0) == 2; 3135 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); 3136 unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; 3137 APInt Imm(32, ImmOp->getImm()); 3138 3139 if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) 3140 Imm = Imm.ashr(16); 3141 3142 if (RI.isAGPR(*MRI, DstReg)) { 3143 if (!isInlineConstant(Imm)) 3144 return false; 3145 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; 3146 } 3147 3148 if (Is16Bit) { 3149 if (isVGPRCopy) 3150 return false; // Do not clobber vgpr_hi16 3151 3152 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) 3153 return false; 3154 3155 UseMI.getOperand(0).setSubReg(0); 3156 if (DstReg.isPhysical()) { 3157 DstReg = RI.get32BitRegister(DstReg); 3158 UseMI.getOperand(0).setReg(DstReg); 3159 } 3160 assert(UseMI.getOperand(1).getReg().isVirtual()); 3161 } 3162 3163 const MCInstrDesc &NewMCID = get(NewOpc); 3164 if (DstReg.isPhysical() && 3165 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg)) 3166 return false; 3167 3168 UseMI.setDesc(NewMCID); 3169 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); 3170 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); 3171 return true; 3172 } 3173 3174 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 3175 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3176 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 3177 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || 3178 Opc == AMDGPU::V_FMAC_F16_t16_e64) { 3179 // Don't fold if we are using source or output modifiers. The new VOP2 3180 // instructions don't have them. 3181 if (hasAnyModifiersSet(UseMI)) 3182 return false; 3183 3184 // If this is a free constant, there's no reason to do this. 3185 // TODO: We could fold this here instead of letting SIFoldOperands do it 3186 // later. 3187 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); 3188 3189 // Any src operand can be used for the legality check. 3190 if (isInlineConstant(UseMI, *Src0, *ImmOp)) 3191 return false; 3192 3193 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || 3194 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; 3195 bool IsFMA = 3196 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || 3197 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || 3198 Opc == AMDGPU::V_FMAC_F16_t16_e64; 3199 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); 3200 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); 3201 3202 // Multiplied part is the constant: Use v_madmk_{f16, f32}. 3203 // We should only expect these to be on src0 due to canonicalization. 3204 if (Src0->isReg() && Src0->getReg() == Reg) { 3205 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) 3206 return false; 3207 3208 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) 3209 return false; 3210 3211 unsigned NewOpc = 3212 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 3213 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 3214 : AMDGPU::V_FMAMK_F16) 3215 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); 3216 if (pseudoToMCOpcode(NewOpc) == -1) 3217 return false; 3218 3219 // We need to swap operands 0 and 1 since madmk constant is at operand 1. 3220 3221 const int64_t Imm = ImmOp->getImm(); 3222 3223 // FIXME: This would be a lot easier if we could return a new instruction 3224 // instead of having to modify in place. 
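      // Illustrative shape of the rewrite (operand order approximate):
      //   v_mad_f32 dst, K, v1, v2  -->  v_madmk_f32 dst, v1, K, v2
      // where K is the immediate defined by DefMI that is being folded.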
3225 3226 Register Src1Reg = Src1->getReg(); 3227 unsigned Src1SubReg = Src1->getSubReg(); 3228 Src0->setReg(Src1Reg); 3229 Src0->setSubReg(Src1SubReg); 3230 Src0->setIsKill(Src1->isKill()); 3231 3232 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3233 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || 3234 Opc == AMDGPU::V_FMAC_F16_e64) 3235 UseMI.untieRegOperand( 3236 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 3237 3238 Src1->ChangeToImmediate(Imm); 3239 3240 removeModOperands(UseMI); 3241 UseMI.setDesc(get(NewOpc)); 3242 3243 bool DeleteDef = MRI->use_nodbg_empty(Reg); 3244 if (DeleteDef) 3245 DefMI.eraseFromParent(); 3246 3247 return true; 3248 } 3249 3250 // Added part is the constant: Use v_madak_{f16, f32}. 3251 if (Src2->isReg() && Src2->getReg() == Reg) { 3252 // Not allowed to use constant bus for another operand. 3253 // We can however allow an inline immediate as src0. 3254 bool Src0Inlined = false; 3255 if (Src0->isReg()) { 3256 // Try to inline constant if possible. 3257 // If the Def moves immediate and the use is single 3258 // We are saving VGPR here. 3259 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); 3260 if (Def && Def->isMoveImmediate() && 3261 isInlineConstant(Def->getOperand(1)) && 3262 MRI->hasOneUse(Src0->getReg())) { 3263 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 3264 Src0Inlined = true; 3265 } else if ((Src0->getReg().isPhysical() && 3266 (ST.getConstantBusLimit(Opc) <= 1 && 3267 RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) || 3268 (Src0->getReg().isVirtual() && 3269 (ST.getConstantBusLimit(Opc) <= 1 && 3270 RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) 3271 return false; 3272 // VGPR is okay as Src0 - fallthrough 3273 } 3274 3275 if (Src1->isReg() && !Src0Inlined ) { 3276 // We have one slot for inlinable constant so far - try to fill it 3277 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); 3278 if (Def && Def->isMoveImmediate() && 3279 isInlineConstant(Def->getOperand(1)) && 3280 MRI->hasOneUse(Src1->getReg()) && 3281 commuteInstruction(UseMI)) { 3282 Src0->ChangeToImmediate(Def->getOperand(1).getImm()); 3283 } else if ((Src1->getReg().isPhysical() && 3284 RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) || 3285 (Src1->getReg().isVirtual() && 3286 RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) 3287 return false; 3288 // VGPR is okay as Src1 - fallthrough 3289 } 3290 3291 unsigned NewOpc = 3292 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 3293 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 3294 : AMDGPU::V_FMAAK_F16) 3295 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); 3296 if (pseudoToMCOpcode(NewOpc) == -1) 3297 return false; 3298 3299 const int64_t Imm = ImmOp->getImm(); 3300 3301 // FIXME: This would be a lot easier if we could return a new instruction 3302 // instead of having to modify in place. 3303 3304 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || 3305 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || 3306 Opc == AMDGPU::V_FMAC_F16_e64) 3307 UseMI.untieRegOperand( 3308 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); 3309 3310 // ChangingToImmediate adds Src2 back to the instruction. 3311 Src2->ChangeToImmediate(Imm); 3312 3313 // These come before src2. 3314 removeModOperands(UseMI); 3315 UseMI.setDesc(get(NewOpc)); 3316 // It might happen that UseMI was commuted 3317 // and we now have SGPR as SRC1. If so 2 inlined 3318 // constant and SGPR are illegal. 
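      // legalizeOperands() cleans that situation up, e.g. by moving the
      // offending SGPR into a VGPR, so the constant-bus limit is respected.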
3319 legalizeOperands(UseMI); 3320 3321 bool DeleteDef = MRI->use_nodbg_empty(Reg); 3322 if (DeleteDef) 3323 DefMI.eraseFromParent(); 3324 3325 return true; 3326 } 3327 } 3328 3329 return false; 3330 } 3331 3332 static bool 3333 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, 3334 ArrayRef<const MachineOperand *> BaseOps2) { 3335 if (BaseOps1.size() != BaseOps2.size()) 3336 return false; 3337 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { 3338 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) 3339 return false; 3340 } 3341 return true; 3342 } 3343 3344 static bool offsetsDoNotOverlap(int WidthA, int OffsetA, 3345 int WidthB, int OffsetB) { 3346 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 3347 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 3348 int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB; 3349 return LowOffset + LowWidth <= HighOffset; 3350 } 3351 3352 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, 3353 const MachineInstr &MIb) const { 3354 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; 3355 int64_t Offset0, Offset1; 3356 unsigned Dummy0, Dummy1; 3357 bool Offset0IsScalable, Offset1IsScalable; 3358 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, 3359 Dummy0, &RI) || 3360 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, 3361 Dummy1, &RI)) 3362 return false; 3363 3364 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) 3365 return false; 3366 3367 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { 3368 // FIXME: Handle ds_read2 / ds_write2. 3369 return false; 3370 } 3371 unsigned Width0 = MIa.memoperands().front()->getSize(); 3372 unsigned Width1 = MIb.memoperands().front()->getSize(); 3373 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); 3374 } 3375 3376 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, 3377 const MachineInstr &MIb) const { 3378 assert(MIa.mayLoadOrStore() && 3379 "MIa must load from or modify a memory location"); 3380 assert(MIb.mayLoadOrStore() && 3381 "MIb must load from or modify a memory location"); 3382 3383 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) 3384 return false; 3385 3386 // XXX - Can we relax this between address spaces? 3387 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 3388 return false; 3389 3390 // TODO: Should we check the address space from the MachineMemOperand? That 3391 // would allow us to distinguish objects we know don't alias based on the 3392 // underlying address space, even if it was lowered to a different one, 3393 // e.g. private accesses lowered to use MUBUF instructions on a scratch 3394 // buffer. 
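// The per-encoding checks below are conservative: two accesses are only
// reported as trivially disjoint when they either use the same encoding with
// identical base operands and provably non-overlapping offsets, or when their
// encodings can only address separate memory. For example, a DS (LDS) access
// cannot alias a buffer, scalar, or segment-specific FLAT access, but may
// alias a generic FLAT access.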
3395 if (isDS(MIa)) { 3396 if (isDS(MIb)) 3397 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3398 3399 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); 3400 } 3401 3402 if (isMUBUF(MIa) || isMTBUF(MIa)) { 3403 if (isMUBUF(MIb) || isMTBUF(MIb)) 3404 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3405 3406 return !isFLAT(MIb) && !isSMRD(MIb); 3407 } 3408 3409 if (isSMRD(MIa)) { 3410 if (isSMRD(MIb)) 3411 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3412 3413 return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); 3414 } 3415 3416 if (isFLAT(MIa)) { 3417 if (isFLAT(MIb)) 3418 return checkInstOffsetsDoNotOverlap(MIa, MIb); 3419 3420 return false; 3421 } 3422 3423 return false; 3424 } 3425 3426 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, 3427 int64_t &Imm, MachineInstr **DefMI = nullptr) { 3428 if (Reg.isPhysical()) 3429 return false; 3430 auto *Def = MRI.getUniqueVRegDef(Reg); 3431 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { 3432 Imm = Def->getOperand(1).getImm(); 3433 if (DefMI) 3434 *DefMI = Def; 3435 return true; 3436 } 3437 return false; 3438 } 3439 3440 static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, 3441 MachineInstr **DefMI = nullptr) { 3442 if (!MO->isReg()) 3443 return false; 3444 const MachineFunction *MF = MO->getParent()->getParent()->getParent(); 3445 const MachineRegisterInfo &MRI = MF->getRegInfo(); 3446 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); 3447 } 3448 3449 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, 3450 MachineInstr &NewMI) { 3451 if (LV) { 3452 unsigned NumOps = MI.getNumOperands(); 3453 for (unsigned I = 1; I < NumOps; ++I) { 3454 MachineOperand &Op = MI.getOperand(I); 3455 if (Op.isReg() && Op.isKill()) 3456 LV->replaceKillInstruction(Op.getReg(), MI, NewMI); 3457 } 3458 } 3459 } 3460 3461 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, 3462 LiveVariables *LV, 3463 LiveIntervals *LIS) const { 3464 MachineBasicBlock &MBB = *MI.getParent(); 3465 unsigned Opc = MI.getOpcode(); 3466 3467 // Handle MFMA. 3468 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc); 3469 if (NewMFMAOpc != -1) { 3470 MachineInstrBuilder MIB = 3471 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc)); 3472 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) 3473 MIB.add(MI.getOperand(I)); 3474 updateLiveVariables(LV, MI, *MIB); 3475 if (LIS) 3476 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3477 return MIB; 3478 } 3479 3480 if (SIInstrInfo::isWMMA(MI)) { 3481 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode()); 3482 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3483 .setMIFlags(MI.getFlags()); 3484 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) 3485 MIB->addOperand(MI.getOperand(I)); 3486 3487 updateLiveVariables(LV, MI, *MIB); 3488 if (LIS) 3489 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3490 3491 return MIB; 3492 } 3493 3494 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && 3495 "V_FMAC_F16_t16_e32 is not supported and not expected to be present " 3496 "pre-RA"); 3497 3498 // Handle MAC/FMAC. 
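// The two-address MAC/FMAC forms accumulate into an operand tied to the
// destination (vdst = src0 * src1 + src2, with src2 tied to vdst). Below this
// is rewritten either into the VOP2 madak/madmk (fmaak/fmamk) forms when one
// of the inputs is a foldable immediate, or into the three-address VOP3
// MAD/FMA otherwise.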
3499 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || 3500 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3501 Opc == AMDGPU::V_FMAC_F16_t16_e64; 3502 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || 3503 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 3504 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || 3505 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || 3506 Opc == AMDGPU::V_FMAC_F16_t16_e64 || 3507 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3508 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; 3509 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || 3510 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || 3511 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || 3512 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64; 3513 bool Src0Literal = false; 3514 3515 switch (Opc) { 3516 default: 3517 return nullptr; 3518 case AMDGPU::V_MAC_F16_e64: 3519 case AMDGPU::V_FMAC_F16_e64: 3520 case AMDGPU::V_FMAC_F16_t16_e64: 3521 case AMDGPU::V_MAC_F32_e64: 3522 case AMDGPU::V_MAC_LEGACY_F32_e64: 3523 case AMDGPU::V_FMAC_F32_e64: 3524 case AMDGPU::V_FMAC_LEGACY_F32_e64: 3525 case AMDGPU::V_FMAC_F64_e64: 3526 break; 3527 case AMDGPU::V_MAC_F16_e32: 3528 case AMDGPU::V_FMAC_F16_e32: 3529 case AMDGPU::V_MAC_F32_e32: 3530 case AMDGPU::V_MAC_LEGACY_F32_e32: 3531 case AMDGPU::V_FMAC_F32_e32: 3532 case AMDGPU::V_FMAC_LEGACY_F32_e32: 3533 case AMDGPU::V_FMAC_F64_e32: { 3534 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3535 AMDGPU::OpName::src0); 3536 const MachineOperand *Src0 = &MI.getOperand(Src0Idx); 3537 if (!Src0->isReg() && !Src0->isImm()) 3538 return nullptr; 3539 3540 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) 3541 Src0Literal = true; 3542 3543 break; 3544 } 3545 } 3546 3547 MachineInstrBuilder MIB; 3548 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 3549 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); 3550 const MachineOperand *Src0Mods = 3551 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); 3552 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 3553 const MachineOperand *Src1Mods = 3554 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); 3555 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3556 const MachineOperand *Src2Mods = 3557 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); 3558 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 3559 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); 3560 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); 3561 3562 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && 3563 !IsLegacy && 3564 // If we have an SGPR input, we will violate the constant bus restriction. 3565 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || 3566 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { 3567 MachineInstr *DefMI; 3568 const auto killDef = [&]() -> void { 3569 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 3570 // The only user is the instruction which will be killed. 3571 Register DefReg = DefMI->getOperand(0).getReg(); 3572 if (!MRI.hasOneNonDBGUse(DefReg)) 3573 return; 3574 // We cannot just remove the DefMI here, calling pass will crash. 
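// Instead, turn the now-unused immediate def into an IMPLICIT_DEF of the same
// register so a later dead-code cleanup can remove it.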
3575 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); 3576 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) 3577 DefMI->removeOperand(I); 3578 if (LV) 3579 LV->getVarInfo(DefReg).AliveBlocks.clear(); 3580 }; 3581 3582 int64_t Imm; 3583 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { 3584 unsigned NewOpc = 3585 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 3586 : AMDGPU::V_FMAAK_F16) 3587 : AMDGPU::V_FMAAK_F32) 3588 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); 3589 if (pseudoToMCOpcode(NewOpc) != -1) { 3590 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3591 .add(*Dst) 3592 .add(*Src0) 3593 .add(*Src1) 3594 .addImm(Imm); 3595 updateLiveVariables(LV, MI, *MIB); 3596 if (LIS) 3597 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3598 killDef(); 3599 return MIB; 3600 } 3601 } 3602 unsigned NewOpc = 3603 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 3604 : AMDGPU::V_FMAMK_F16) 3605 : AMDGPU::V_FMAMK_F32) 3606 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); 3607 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { 3608 if (pseudoToMCOpcode(NewOpc) != -1) { 3609 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3610 .add(*Dst) 3611 .add(*Src0) 3612 .addImm(Imm) 3613 .add(*Src2); 3614 updateLiveVariables(LV, MI, *MIB); 3615 if (LIS) 3616 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3617 killDef(); 3618 return MIB; 3619 } 3620 } 3621 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) { 3622 if (Src0Literal) { 3623 Imm = Src0->getImm(); 3624 DefMI = nullptr; 3625 } 3626 if (pseudoToMCOpcode(NewOpc) != -1 && 3627 isOperandLegal( 3628 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), 3629 Src1)) { 3630 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3631 .add(*Dst) 3632 .add(*Src1) 3633 .addImm(Imm) 3634 .add(*Src2); 3635 updateLiveVariables(LV, MI, *MIB); 3636 if (LIS) 3637 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3638 if (DefMI) 3639 killDef(); 3640 return MIB; 3641 } 3642 } 3643 } 3644 3645 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma 3646 // if VOP3 does not allow a literal operand. 3647 if (Src0Literal && !ST.hasVOP3Literal()) 3648 return nullptr; 3649 3650 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 3651 : IsF64 ? AMDGPU::V_FMA_F64_e64 3652 : IsLegacy 3653 ? AMDGPU::V_FMA_LEGACY_F32_e64 3654 : AMDGPU::V_FMA_F32_e64 3655 : IsF16 ? AMDGPU::V_MAD_F16_e64 3656 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64 3657 : AMDGPU::V_MAD_F32_e64; 3658 if (pseudoToMCOpcode(NewOpc) == -1) 3659 return nullptr; 3660 3661 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) 3662 .add(*Dst) 3663 .addImm(Src0Mods ? Src0Mods->getImm() : 0) 3664 .add(*Src0) 3665 .addImm(Src1Mods ? Src1Mods->getImm() : 0) 3666 .add(*Src1) 3667 .addImm(Src2Mods ? Src2Mods->getImm() : 0) 3668 .add(*Src2) 3669 .addImm(Clamp ? Clamp->getImm() : 0) 3670 .addImm(Omod ? Omod->getImm() : 0); 3671 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) 3672 MIB.addImm(OpSel ? OpSel->getImm() : 0); 3673 updateLiveVariables(LV, MI, *MIB); 3674 if (LIS) 3675 LIS->ReplaceMachineInstrInMaps(MI, *MIB); 3676 return MIB; 3677 } 3678 3679 // It's not generally safe to move VALU instructions across these since it will 3680 // start using the register as a base index rather than directly. 3681 // XXX - Why isn't hasSideEffects sufficient for these? 
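// For example, between S_SET_GPR_IDX_ON and S_SET_GPR_IDX_OFF the index held
// in M0 is applied to selected VGPR operands of the VALU instructions in that
// window, so hoisting or sinking an unrelated VALU instruction across these
// markers would silently change which registers it reads or writes.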
3682 static bool changesVGPRIndexingMode(const MachineInstr &MI) { 3683 switch (MI.getOpcode()) { 3684 case AMDGPU::S_SET_GPR_IDX_ON: 3685 case AMDGPU::S_SET_GPR_IDX_MODE: 3686 case AMDGPU::S_SET_GPR_IDX_OFF: 3687 return true; 3688 default: 3689 return false; 3690 } 3691 } 3692 3693 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, 3694 const MachineBasicBlock *MBB, 3695 const MachineFunction &MF) const { 3696 // Skipping the check for SP writes in the base implementation. The reason it 3697 // was added was apparently due to compile time concerns. 3698 // 3699 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops 3700 // but is probably avoidable. 3701 3702 // Copied from base implementation. 3703 // Terminators and labels can't be scheduled around. 3704 if (MI.isTerminator() || MI.isPosition()) 3705 return true; 3706 3707 // INLINEASM_BR can jump to another block 3708 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) 3709 return true; 3710 3711 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0) 3712 return true; 3713 3714 // Target-independent instructions do not have an implicit-use of EXEC, even 3715 // when they operate on VGPRs. Treating EXEC modifications as scheduling 3716 // boundaries prevents incorrect movements of such instructions. 3717 return MI.modifiesRegister(AMDGPU::EXEC, &RI) || 3718 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || 3719 MI.getOpcode() == AMDGPU::S_SETREG_B32 || 3720 MI.getOpcode() == AMDGPU::S_SETPRIO || 3721 changesVGPRIndexingMode(MI); 3722 } 3723 3724 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { 3725 return Opcode == AMDGPU::DS_ORDERED_COUNT || 3726 Opcode == AMDGPU::DS_GWS_INIT || 3727 Opcode == AMDGPU::DS_GWS_SEMA_V || 3728 Opcode == AMDGPU::DS_GWS_SEMA_BR || 3729 Opcode == AMDGPU::DS_GWS_SEMA_P || 3730 Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || 3731 Opcode == AMDGPU::DS_GWS_BARRIER; 3732 } 3733 3734 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { 3735 // Skip the full operand and register alias search modifiesRegister 3736 // does. There's only a handful of instructions that touch this, it's only an 3737 // implicit def, and doesn't alias any other registers. 3738 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE); 3739 } 3740 3741 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { 3742 unsigned Opcode = MI.getOpcode(); 3743 3744 if (MI.mayStore() && isSMRD(MI)) 3745 return true; // scalar store or atomic 3746 3747 // This will terminate the function when other lanes may need to continue. 3748 if (MI.isReturn()) 3749 return true; 3750 3751 // These instructions cause shader I/O that may cause hardware lockups 3752 // when executed with an empty EXEC mask. 3753 // 3754 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when 3755 // EXEC = 0, but checking for that case here seems not worth it 3756 // given the typical code patterns. 3757 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || 3758 isEXP(Opcode) || 3759 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP || 3760 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER) 3761 return true; 3762 3763 if (MI.isCall() || MI.isInlineAsm()) 3764 return true; // conservative assumption 3765 3766 // A mode change is a scalar operation that influences vector instructions. 
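// For example, an S_SETREG that changes the FP rounding or denormal bits of
// the MODE register executes regardless of EXEC and would alter the results
// of later vector arithmetic, so it must be reported here rather than treated
// as a harmless scalar operation.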
3767 if (modifiesModeRegister(MI)) 3768 return true; 3769 3770 // These are like SALU instructions in terms of effects, so it's questionable 3771 // whether we should return true for those. 3772 // 3773 // However, executing them with EXEC = 0 causes them to operate on undefined 3774 // data, which we avoid by returning true here. 3775 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || 3776 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) 3777 return true; 3778 3779 return false; 3780 } 3781 3782 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI, 3783 const MachineInstr &MI) const { 3784 if (MI.isMetaInstruction()) 3785 return false; 3786 3787 // This won't read exec if this is an SGPR->SGPR copy. 3788 if (MI.isCopyLike()) { 3789 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg())) 3790 return true; 3791 3792 // Make sure this isn't copying exec as a normal operand 3793 return MI.readsRegister(AMDGPU::EXEC, &RI); 3794 } 3795 3796 // Make a conservative assumption about the callee. 3797 if (MI.isCall()) 3798 return true; 3799 3800 // Be conservative with any unhandled generic opcodes. 3801 if (!isTargetSpecificOpcode(MI.getOpcode())) 3802 return true; 3803 3804 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI); 3805 } 3806 3807 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { 3808 switch (Imm.getBitWidth()) { 3809 case 1: // This likely will be a condition code mask. 3810 return true; 3811 3812 case 32: 3813 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), 3814 ST.hasInv2PiInlineImm()); 3815 case 64: 3816 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), 3817 ST.hasInv2PiInlineImm()); 3818 case 16: 3819 return ST.has16BitInsts() && 3820 AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), 3821 ST.hasInv2PiInlineImm()); 3822 default: 3823 llvm_unreachable("invalid bitwidth"); 3824 } 3825 } 3826 3827 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, 3828 uint8_t OperandType) const { 3829 assert(!MO.isReg() && "isInlineConstant called on register operand!"); 3830 if (!MO.isImm() || 3831 OperandType < AMDGPU::OPERAND_SRC_FIRST || 3832 OperandType > AMDGPU::OPERAND_SRC_LAST) 3833 return false; 3834 3835 // MachineOperand provides no way to tell the true operand size, since it only 3836 // records a 64-bit value. We need to know the size to determine if a 32-bit 3837 // floating point immediate bit pattern is legal for an integer immediate. It 3838 // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
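// For example, the 32-bit pattern 0x3f800000 (1.0f) is an inline constant for
// a 32-bit operand, but taken as a 64-bit immediate it is just a large
// integer and would have to be materialized as a literal.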
3839
3840 int64_t Imm = MO.getImm();
3841 switch (OperandType) {
3842 case AMDGPU::OPERAND_REG_IMM_INT32:
3843 case AMDGPU::OPERAND_REG_IMM_FP32:
3844 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
3845 case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3846 case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3847 case AMDGPU::OPERAND_REG_IMM_V2FP32:
3848 case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
3849 case AMDGPU::OPERAND_REG_IMM_V2INT32:
3850 case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
3851 case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3852 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
3853 int32_t Trunc = static_cast<int32_t>(Imm);
3854 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
3855 }
3856 case AMDGPU::OPERAND_REG_IMM_INT64:
3857 case AMDGPU::OPERAND_REG_IMM_FP64:
3858 case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3859 case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3860 case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
3861 return AMDGPU::isInlinableLiteral64(MO.getImm(),
3862 ST.hasInv2PiInlineImm());
3863 case AMDGPU::OPERAND_REG_IMM_INT16:
3864 case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3865 case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3866 // We would expect inline immediates to not be concerned with an integer/fp
3867 // distinction. However, in the case of 16-bit integer operations, the
3868 // "floating point" values appear to not work. It seems to read the low
3869 // 16 bits of 32-bit immediates, which happens to always work for the
3870 // integer values.
3871 //
3872 // See llvm bugzilla 46302.
3873 //
3874 // TODO: Theoretically we could use op-sel to use the high bits of the
3875 // 32-bit FP values.
3876 return AMDGPU::isInlinableIntLiteral(Imm);
3877 case AMDGPU::OPERAND_REG_IMM_V2INT16:
3878 case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
3879 case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
3880 // This suffers the same problem as the scalar 16-bit cases.
3881 return AMDGPU::isInlinableIntLiteralV216(Imm);
3882 case AMDGPU::OPERAND_REG_IMM_FP16:
3883 case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
3884 case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3885 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
3886 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
3887 // A few special case instructions have 16-bit operands on subtargets
3888 // where 16-bit instructions are not legal.
3889 // TODO: Do the 32-bit immediates work?
We shouldn't really need to handle 3890 // constants in these cases 3891 int16_t Trunc = static_cast<int16_t>(Imm); 3892 return ST.has16BitInsts() && 3893 AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); 3894 } 3895 3896 return false; 3897 } 3898 case AMDGPU::OPERAND_REG_IMM_V2FP16: 3899 case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: 3900 case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { 3901 uint32_t Trunc = static_cast<uint32_t>(Imm); 3902 return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); 3903 } 3904 case AMDGPU::OPERAND_KIMM32: 3905 case AMDGPU::OPERAND_KIMM16: 3906 return false; 3907 default: 3908 llvm_unreachable("invalid bitwidth"); 3909 } 3910 } 3911 3912 static bool compareMachineOp(const MachineOperand &Op0, 3913 const MachineOperand &Op1) { 3914 if (Op0.getType() != Op1.getType()) 3915 return false; 3916 3917 switch (Op0.getType()) { 3918 case MachineOperand::MO_Register: 3919 return Op0.getReg() == Op1.getReg(); 3920 case MachineOperand::MO_Immediate: 3921 return Op0.getImm() == Op1.getImm(); 3922 default: 3923 llvm_unreachable("Didn't expect to be comparing these operand types"); 3924 } 3925 } 3926 3927 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, 3928 const MachineOperand &MO) const { 3929 const MCInstrDesc &InstDesc = MI.getDesc(); 3930 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo]; 3931 3932 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 3933 3934 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE) 3935 return true; 3936 3937 if (OpInfo.RegClass < 0) 3938 return false; 3939 3940 if (MO.isImm() && isInlineConstant(MO, OpInfo)) { 3941 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() && 3942 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(), 3943 AMDGPU::OpName::src2)) 3944 return false; 3945 return RI.opCanUseInlineConstant(OpInfo.OperandType); 3946 } 3947 3948 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType)) 3949 return false; 3950 3951 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo)) 3952 return true; 3953 3954 return ST.hasVOP3Literal(); 3955 } 3956 3957 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { 3958 // GFX90A does not have V_MUL_LEGACY_F32_e32. 3959 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) 3960 return false; 3961 3962 int Op32 = AMDGPU::getVOPe32(Opcode); 3963 if (Op32 == -1) 3964 return false; 3965 3966 return pseudoToMCOpcode(Op32) != -1; 3967 } 3968 3969 bool SIInstrInfo::hasModifiers(unsigned Opcode) const { 3970 // The src0_modifier operand is present on all instructions 3971 // that have modifiers. 3972 3973 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers); 3974 } 3975 3976 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, 3977 unsigned OpName) const { 3978 const MachineOperand *Mods = getNamedOperand(MI, OpName); 3979 return Mods && Mods->getImm(); 3980 } 3981 3982 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { 3983 return any_of(ModifierOpNames, 3984 [&](unsigned Name) { return hasModifiersSet(MI, Name); }); 3985 } 3986 3987 bool SIInstrInfo::canShrink(const MachineInstr &MI, 3988 const MachineRegisterInfo &MRI) const { 3989 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 3990 // Can't shrink instruction with three operands. 
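// The exceptions below are instructions whose third operand is implicit in
// the 32-bit encoding: the carry input of V_ADDC/V_SUBB/V_SUBBREV and the
// select mask of V_CNDMASK live in VCC, and the MAC/FMAC accumulator is tied
// to the destination.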
3991 if (Src2) { 3992 switch (MI.getOpcode()) { 3993 default: return false; 3994 3995 case AMDGPU::V_ADDC_U32_e64: 3996 case AMDGPU::V_SUBB_U32_e64: 3997 case AMDGPU::V_SUBBREV_U32_e64: { 3998 const MachineOperand *Src1 3999 = getNamedOperand(MI, AMDGPU::OpName::src1); 4000 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) 4001 return false; 4002 // Additional verification is needed for sdst/src2. 4003 return true; 4004 } 4005 case AMDGPU::V_MAC_F16_e64: 4006 case AMDGPU::V_MAC_F32_e64: 4007 case AMDGPU::V_MAC_LEGACY_F32_e64: 4008 case AMDGPU::V_FMAC_F16_e64: 4009 case AMDGPU::V_FMAC_F16_t16_e64: 4010 case AMDGPU::V_FMAC_F32_e64: 4011 case AMDGPU::V_FMAC_F64_e64: 4012 case AMDGPU::V_FMAC_LEGACY_F32_e64: 4013 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || 4014 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) 4015 return false; 4016 break; 4017 4018 case AMDGPU::V_CNDMASK_B32_e64: 4019 break; 4020 } 4021 } 4022 4023 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 4024 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || 4025 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) 4026 return false; 4027 4028 // We don't need to check src0, all input types are legal, so just make sure 4029 // src0 isn't using any modifiers. 4030 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) 4031 return false; 4032 4033 // Can it be shrunk to a valid 32 bit opcode? 4034 if (!hasVALU32BitEncoding(MI.getOpcode())) 4035 return false; 4036 4037 // Check output modifiers 4038 return !hasModifiersSet(MI, AMDGPU::OpName::omod) && 4039 !hasModifiersSet(MI, AMDGPU::OpName::clamp); 4040 } 4041 4042 // Set VCC operand with all flags from \p Orig, except for setting it as 4043 // implicit. 4044 static void copyFlagsToImplicitVCC(MachineInstr &MI, 4045 const MachineOperand &Orig) { 4046 4047 for (MachineOperand &Use : MI.implicit_operands()) { 4048 if (Use.isUse() && 4049 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { 4050 Use.setIsUndef(Orig.isUndef()); 4051 Use.setIsKill(Orig.isKill()); 4052 return; 4053 } 4054 } 4055 } 4056 4057 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, 4058 unsigned Op32) const { 4059 MachineBasicBlock *MBB = MI.getParent(); 4060 MachineInstrBuilder Inst32 = 4061 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) 4062 .setMIFlags(MI.getFlags()); 4063 4064 // Add the dst operand if the 32-bit encoding also has an explicit $vdst. 4065 // For VOPC instructions, this is replaced by an implicit def of vcc. 4066 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) { 4067 // dst 4068 Inst32.add(MI.getOperand(0)); 4069 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) { 4070 // VOPCX instructions won't be writing to an explicit dst, so this should 4071 // not fail for these instructions. 4072 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) || 4073 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) && 4074 "Unexpected case"); 4075 } 4076 4077 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); 4078 4079 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); 4080 if (Src1) 4081 Inst32.add(*Src1); 4082 4083 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); 4084 4085 if (Src2) { 4086 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); 4087 if (Op32Src2Idx != -1) { 4088 Inst32.add(*Src2); 4089 } else { 4090 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is 4091 // replaced with an implicit read of vcc or vcc_lo. 
The implicit read 4092 // of vcc was already added during the initial BuildMI, but we 4093 // 1) may need to change vcc to vcc_lo to preserve the original register 4094 // 2) have to preserve the original flags. 4095 fixImplicitOperands(*Inst32); 4096 copyFlagsToImplicitVCC(*Inst32, *Src2); 4097 } 4098 } 4099 4100 return Inst32; 4101 } 4102 4103 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, 4104 const MachineOperand &MO, 4105 const MCOperandInfo &OpInfo) const { 4106 // Literal constants use the constant bus. 4107 if (!MO.isReg()) 4108 return !isInlineConstant(MO, OpInfo); 4109 4110 if (!MO.isUse()) 4111 return false; 4112 4113 if (MO.getReg().isVirtual()) 4114 return RI.isSGPRClass(MRI.getRegClass(MO.getReg())); 4115 4116 // Null is free 4117 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64) 4118 return false; 4119 4120 // SGPRs use the constant bus 4121 if (MO.isImplicit()) { 4122 return MO.getReg() == AMDGPU::M0 || 4123 MO.getReg() == AMDGPU::VCC || 4124 MO.getReg() == AMDGPU::VCC_LO; 4125 } else { 4126 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) || 4127 AMDGPU::SReg_64RegClass.contains(MO.getReg()); 4128 } 4129 } 4130 4131 static Register findImplicitSGPRRead(const MachineInstr &MI) { 4132 for (const MachineOperand &MO : MI.implicit_operands()) { 4133 // We only care about reads. 4134 if (MO.isDef()) 4135 continue; 4136 4137 switch (MO.getReg()) { 4138 case AMDGPU::VCC: 4139 case AMDGPU::VCC_LO: 4140 case AMDGPU::VCC_HI: 4141 case AMDGPU::M0: 4142 case AMDGPU::FLAT_SCR: 4143 return MO.getReg(); 4144 4145 default: 4146 break; 4147 } 4148 } 4149 4150 return Register(); 4151 } 4152 4153 static bool shouldReadExec(const MachineInstr &MI) { 4154 if (SIInstrInfo::isVALU(MI)) { 4155 switch (MI.getOpcode()) { 4156 case AMDGPU::V_READLANE_B32: 4157 case AMDGPU::V_WRITELANE_B32: 4158 return false; 4159 } 4160 4161 return true; 4162 } 4163 4164 if (MI.isPreISelOpcode() || 4165 SIInstrInfo::isGenericOpcode(MI.getOpcode()) || 4166 SIInstrInfo::isSALU(MI) || 4167 SIInstrInfo::isSMRD(MI)) 4168 return false; 4169 4170 return true; 4171 } 4172 4173 static bool isSubRegOf(const SIRegisterInfo &TRI, 4174 const MachineOperand &SuperVec, 4175 const MachineOperand &SubReg) { 4176 if (SubReg.getReg().isPhysical()) 4177 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); 4178 4179 return SubReg.getSubReg() != AMDGPU::NoSubRegister && 4180 SubReg.getReg() == SuperVec.getReg(); 4181 } 4182 4183 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, 4184 StringRef &ErrInfo) const { 4185 uint16_t Opcode = MI.getOpcode(); 4186 if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) 4187 return true; 4188 4189 const MachineFunction *MF = MI.getParent()->getParent(); 4190 const MachineRegisterInfo &MRI = MF->getRegInfo(); 4191 4192 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); 4193 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); 4194 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); 4195 int Src3Idx = -1; 4196 if (Src0Idx == -1) { 4197 // VOPD V_DUAL_* instructions use different operand names. 4198 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); 4199 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); 4200 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); 4201 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); 4202 } 4203 4204 // Make sure the number of operands is correct. 
4205 const MCInstrDesc &Desc = get(Opcode); 4206 if (!Desc.isVariadic() && 4207 Desc.getNumOperands() != MI.getNumExplicitOperands()) { 4208 ErrInfo = "Instruction has wrong number of operands."; 4209 return false; 4210 } 4211 4212 if (MI.isInlineAsm()) { 4213 // Verify register classes for inlineasm constraints. 4214 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); 4215 I != E; ++I) { 4216 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); 4217 if (!RC) 4218 continue; 4219 4220 const MachineOperand &Op = MI.getOperand(I); 4221 if (!Op.isReg()) 4222 continue; 4223 4224 Register Reg = Op.getReg(); 4225 if (!Reg.isVirtual() && !RC->contains(Reg)) { 4226 ErrInfo = "inlineasm operand has incorrect register class."; 4227 return false; 4228 } 4229 } 4230 4231 return true; 4232 } 4233 4234 if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { 4235 ErrInfo = "missing memory operand from MIMG instruction."; 4236 return false; 4237 } 4238 4239 // Make sure the register classes are correct. 4240 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { 4241 const MachineOperand &MO = MI.getOperand(i); 4242 if (MO.isFPImm()) { 4243 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " 4244 "all fp values to integers."; 4245 return false; 4246 } 4247 4248 int RegClass = Desc.operands()[i].RegClass; 4249 4250 switch (Desc.operands()[i].OperandType) { 4251 case MCOI::OPERAND_REGISTER: 4252 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) { 4253 ErrInfo = "Illegal immediate value for operand."; 4254 return false; 4255 } 4256 break; 4257 case AMDGPU::OPERAND_REG_IMM_INT32: 4258 case AMDGPU::OPERAND_REG_IMM_FP32: 4259 case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: 4260 case AMDGPU::OPERAND_REG_IMM_V2FP32: 4261 break; 4262 case AMDGPU::OPERAND_REG_INLINE_C_INT32: 4263 case AMDGPU::OPERAND_REG_INLINE_C_FP32: 4264 case AMDGPU::OPERAND_REG_INLINE_C_INT64: 4265 case AMDGPU::OPERAND_REG_INLINE_C_FP64: 4266 case AMDGPU::OPERAND_REG_INLINE_C_INT16: 4267 case AMDGPU::OPERAND_REG_INLINE_C_FP16: 4268 case AMDGPU::OPERAND_REG_INLINE_AC_INT32: 4269 case AMDGPU::OPERAND_REG_INLINE_AC_FP32: 4270 case AMDGPU::OPERAND_REG_INLINE_AC_INT16: 4271 case AMDGPU::OPERAND_REG_INLINE_AC_FP16: 4272 case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { 4273 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { 4274 ErrInfo = "Illegal immediate value for operand."; 4275 return false; 4276 } 4277 break; 4278 } 4279 case MCOI::OPERAND_IMMEDIATE: 4280 case AMDGPU::OPERAND_KIMM32: 4281 // Check if this operand is an immediate. 4282 // FrameIndex operands will be replaced by immediates, so they are 4283 // allowed. 4284 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { 4285 ErrInfo = "Expected immediate, but got non-immediate"; 4286 return false; 4287 } 4288 [[fallthrough]]; 4289 default: 4290 continue; 4291 } 4292 4293 if (!MO.isReg()) 4294 continue; 4295 Register Reg = MO.getReg(); 4296 if (!Reg) 4297 continue; 4298 4299 // FIXME: Ideally we would have separate instruction definitions with the 4300 // aligned register constraint. 
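// On subtargets that require aligned VGPRs (e.g. gfx90a), register tuples
// wider than 32 bits must start at an even register, so v[2:3] is a valid
// 64-bit operand while v[3:4] is not; the check below rejects operands whose
// register class or sub-register breaks that alignment.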
4301 // FIXME: We do not verify inline asm operands, but custom inline asm 4302 // verification is broken anyway 4303 if (ST.needsAlignedVGPRs()) { 4304 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg); 4305 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) { 4306 const TargetRegisterClass *SubRC = 4307 RI.getSubRegisterClass(RC, MO.getSubReg()); 4308 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg()); 4309 if (RC) 4310 RC = SubRC; 4311 } 4312 4313 // Check that this is the aligned version of the class. 4314 if (!RC || !RI.isProperlyAlignedRC(*RC)) { 4315 ErrInfo = "Subtarget requires even aligned vector registers"; 4316 return false; 4317 } 4318 } 4319 4320 if (RegClass != -1) { 4321 if (Reg.isVirtual()) 4322 continue; 4323 4324 const TargetRegisterClass *RC = RI.getRegClass(RegClass); 4325 if (!RC->contains(Reg)) { 4326 ErrInfo = "Operand has incorrect register class."; 4327 return false; 4328 } 4329 } 4330 } 4331 4332 // Verify SDWA 4333 if (isSDWA(MI)) { 4334 if (!ST.hasSDWA()) { 4335 ErrInfo = "SDWA is not supported on this target"; 4336 return false; 4337 } 4338 4339 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 4340 4341 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) { 4342 if (OpIdx == -1) 4343 continue; 4344 const MachineOperand &MO = MI.getOperand(OpIdx); 4345 4346 if (!ST.hasSDWAScalar()) { 4347 // Only VGPRS on VI 4348 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) { 4349 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI"; 4350 return false; 4351 } 4352 } else { 4353 // No immediates on GFX9 4354 if (!MO.isReg()) { 4355 ErrInfo = 4356 "Only reg allowed as operands in SDWA instructions on GFX9+"; 4357 return false; 4358 } 4359 } 4360 } 4361 4362 if (!ST.hasSDWAOmod()) { 4363 // No omod allowed on VI 4364 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 4365 if (OMod != nullptr && 4366 (!OMod->isImm() || OMod->getImm() != 0)) { 4367 ErrInfo = "OMod not allowed in SDWA instructions on VI"; 4368 return false; 4369 } 4370 } 4371 4372 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode); 4373 if (isVOPC(BasicOpcode)) { 4374 if (!ST.hasSDWASdst() && DstIdx != -1) { 4375 // Only vcc allowed as dst on VI for VOPC 4376 const MachineOperand &Dst = MI.getOperand(DstIdx); 4377 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) { 4378 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI"; 4379 return false; 4380 } 4381 } else if (!ST.hasSDWAOutModsVOPC()) { 4382 // No clamp allowed on GFX9 for VOPC 4383 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); 4384 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) { 4385 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI"; 4386 return false; 4387 } 4388 4389 // No omod allowed on GFX9 for VOPC 4390 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod); 4391 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) { 4392 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI"; 4393 return false; 4394 } 4395 } 4396 } 4397 4398 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); 4399 if (DstUnused && DstUnused->isImm() && 4400 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { 4401 const MachineOperand &Dst = MI.getOperand(DstIdx); 4402 if (!Dst.isReg() || !Dst.isTied()) { 4403 ErrInfo = "Dst register should have tied register"; 4404 return false; 4405 } 4406 4407 const MachineOperand &TiedMO = 4408 
MI.getOperand(MI.findTiedOperandIdx(DstIdx)); 4409 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { 4410 ErrInfo = 4411 "Dst register should be tied to implicit use of preserved register"; 4412 return false; 4413 } else if (TiedMO.getReg().isPhysical() && 4414 Dst.getReg() != TiedMO.getReg()) { 4415 ErrInfo = "Dst register should use same physical register as preserved"; 4416 return false; 4417 } 4418 } 4419 } 4420 4421 // Verify MIMG 4422 if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { 4423 // Ensure that the return type used is large enough for all the options 4424 // being used TFE/LWE require an extra result register. 4425 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); 4426 if (DMask) { 4427 uint64_t DMaskImm = DMask->getImm(); 4428 uint32_t RegCount = 4429 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm); 4430 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); 4431 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); 4432 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); 4433 4434 // Adjust for packed 16 bit values 4435 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) 4436 RegCount = divideCeil(RegCount, 2); 4437 4438 // Adjust if using LWE or TFE 4439 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) 4440 RegCount += 1; 4441 4442 const uint32_t DstIdx = 4443 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); 4444 const MachineOperand &Dst = MI.getOperand(DstIdx); 4445 if (Dst.isReg()) { 4446 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); 4447 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; 4448 if (RegCount > DstSize) { 4449 ErrInfo = "Image instruction returns too many registers for dst " 4450 "register class"; 4451 return false; 4452 } 4453 } 4454 } 4455 } 4456 4457 // Verify VOP*. Ignore multiple sgpr operands on writelane. 4458 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) { 4459 unsigned ConstantBusCount = 0; 4460 bool UsesLiteral = false; 4461 const MachineOperand *LiteralVal = nullptr; 4462 4463 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm); 4464 if (ImmIdx != -1) { 4465 ++ConstantBusCount; 4466 UsesLiteral = true; 4467 LiteralVal = &MI.getOperand(ImmIdx); 4468 } 4469 4470 SmallVector<Register, 2> SGPRsUsed; 4471 Register SGPRUsed; 4472 4473 // Only look at the true operands. Only a real operand can use the constant 4474 // bus, and we don't want to check pseudo-operands like the source modifier 4475 // flags. 
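// For example, with a constant bus limit of one, something like
// V_ADD_F32 v0, s0, s1 is rejected because each distinct SGPR or literal
// source needs its own slot on the scalar operand bus, while
// V_ADD_F32 v0, s0, v1 is fine.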
4476 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { 4477 if (OpIdx == -1) 4478 continue; 4479 const MachineOperand &MO = MI.getOperand(OpIdx); 4480 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { 4481 if (MO.isReg()) { 4482 SGPRUsed = MO.getReg(); 4483 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) { 4484 ++ConstantBusCount; 4485 SGPRsUsed.push_back(SGPRUsed); 4486 } 4487 } else { 4488 if (!UsesLiteral) { 4489 ++ConstantBusCount; 4490 UsesLiteral = true; 4491 LiteralVal = &MO; 4492 } else if (!MO.isIdenticalTo(*LiteralVal)) { 4493 assert(isVOP2(MI) || isVOP3(MI)); 4494 ErrInfo = "VOP2/VOP3 instruction uses more than one literal"; 4495 return false; 4496 } 4497 } 4498 } 4499 } 4500 4501 SGPRUsed = findImplicitSGPRRead(MI); 4502 if (SGPRUsed) { 4503 // Implicit uses may safely overlap true operands 4504 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { 4505 return !RI.regsOverlap(SGPRUsed, SGPR); 4506 })) { 4507 ++ConstantBusCount; 4508 SGPRsUsed.push_back(SGPRUsed); 4509 } 4510 } 4511 4512 // v_writelane_b32 is an exception from constant bus restriction: 4513 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const 4514 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && 4515 Opcode != AMDGPU::V_WRITELANE_B32) { 4516 ErrInfo = "VOP* instruction violates constant bus restriction"; 4517 return false; 4518 } 4519 4520 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) { 4521 ErrInfo = "VOP3 instruction uses literal"; 4522 return false; 4523 } 4524 } 4525 4526 // Special case for writelane - this can break the multiple constant bus rule, 4527 // but still can't use more than one SGPR register 4528 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) { 4529 unsigned SGPRCount = 0; 4530 Register SGPRUsed; 4531 4532 for (int OpIdx : {Src0Idx, Src1Idx}) { 4533 if (OpIdx == -1) 4534 break; 4535 4536 const MachineOperand &MO = MI.getOperand(OpIdx); 4537 4538 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) { 4539 if (MO.isReg() && MO.getReg() != AMDGPU::M0) { 4540 if (MO.getReg() != SGPRUsed) 4541 ++SGPRCount; 4542 SGPRUsed = MO.getReg(); 4543 } 4544 } 4545 if (SGPRCount > ST.getConstantBusLimit(Opcode)) { 4546 ErrInfo = "WRITELANE instruction violates constant bus restriction"; 4547 return false; 4548 } 4549 } 4550 } 4551 4552 // Verify misc. restrictions on specific instructions. 
4553 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || 4554 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { 4555 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4556 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4557 const MachineOperand &Src2 = MI.getOperand(Src2Idx); 4558 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { 4559 if (!compareMachineOp(Src0, Src1) && 4560 !compareMachineOp(Src0, Src2)) { 4561 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2"; 4562 return false; 4563 } 4564 } 4565 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & 4566 SISrcMods::ABS) || 4567 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() & 4568 SISrcMods::ABS) || 4569 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & 4570 SISrcMods::ABS)) { 4571 ErrInfo = "ABS not allowed in VOP3B instructions"; 4572 return false; 4573 } 4574 } 4575 4576 if (isSOP2(MI) || isSOPC(MI)) { 4577 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4578 const MachineOperand &Src1 = MI.getOperand(Src1Idx); 4579 4580 if (!Src0.isReg() && !Src1.isReg() && 4581 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) && 4582 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) && 4583 !Src0.isIdenticalTo(Src1)) { 4584 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants"; 4585 return false; 4586 } 4587 } 4588 4589 if (isSOPK(MI)) { 4590 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16); 4591 if (Desc.isBranch()) { 4592 if (!Op->isMBB()) { 4593 ErrInfo = "invalid branch target for SOPK instruction"; 4594 return false; 4595 } 4596 } else { 4597 uint64_t Imm = Op->getImm(); 4598 if (sopkIsZext(MI)) { 4599 if (!isUInt<16>(Imm)) { 4600 ErrInfo = "invalid immediate for SOPK instruction"; 4601 return false; 4602 } 4603 } else { 4604 if (!isInt<16>(Imm)) { 4605 ErrInfo = "invalid immediate for SOPK instruction"; 4606 return false; 4607 } 4608 } 4609 } 4610 } 4611 4612 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || 4613 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || 4614 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4615 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { 4616 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || 4617 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; 4618 4619 const unsigned StaticNumOps = 4620 Desc.getNumOperands() + Desc.implicit_uses().size(); 4621 const unsigned NumImplicitOps = IsDst ? 2 : 1; 4622 4623 // Allow additional implicit operands. This allows a fixup done by the post 4624 // RA scheduler where the main implicit operand is killed and implicit-defs 4625 // are added for sub-registers that remain live after this instruction. 4626 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { 4627 ErrInfo = "missing implicit register operands"; 4628 return false; 4629 } 4630 4631 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4632 if (IsDst) { 4633 if (!Dst->isUse()) { 4634 ErrInfo = "v_movreld_b32 vdst should be a use operand"; 4635 return false; 4636 } 4637 4638 unsigned UseOpIdx; 4639 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || 4640 UseOpIdx != StaticNumOps + 1) { 4641 ErrInfo = "movrel implicit operands should be tied"; 4642 return false; 4643 } 4644 } 4645 4646 const MachineOperand &Src0 = MI.getOperand(Src0Idx); 4647 const MachineOperand &ImpUse 4648 = MI.getOperand(StaticNumOps + NumImplicitOps - 1); 4649 if (!ImpUse.isReg() || !ImpUse.isUse() || 4650 !isSubRegOf(RI, ImpUse, IsDst ? 
*Dst : Src0)) { 4651 ErrInfo = "src0 should be subreg of implicit vector use"; 4652 return false; 4653 } 4654 } 4655 4656 // Make sure we aren't losing exec uses in the td files. This mostly requires 4657 // being careful when using let Uses to try to add other use registers. 4658 if (shouldReadExec(MI)) { 4659 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { 4660 ErrInfo = "VALU instruction does not implicitly read exec mask"; 4661 return false; 4662 } 4663 } 4664 4665 if (isSMRD(MI)) { 4666 if (MI.mayStore() && 4667 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { 4668 // The register offset form of scalar stores may only use m0 as the 4669 // soffset register. 4670 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset); 4671 if (Soff && Soff->getReg() != AMDGPU::M0) { 4672 ErrInfo = "scalar stores must use m0 as offset register"; 4673 return false; 4674 } 4675 } 4676 } 4677 4678 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) { 4679 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 4680 if (Offset->getImm() != 0) { 4681 ErrInfo = "subtarget does not support offsets in flat instructions"; 4682 return false; 4683 } 4684 } 4685 4686 if (isMIMG(MI)) { 4687 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); 4688 if (DimOp) { 4689 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode, 4690 AMDGPU::OpName::vaddr0); 4691 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc); 4692 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode); 4693 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4694 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); 4695 const AMDGPU::MIMGDimInfo *Dim = 4696 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm()); 4697 4698 if (!Dim) { 4699 ErrInfo = "dim is out of range"; 4700 return false; 4701 } 4702 4703 bool IsA16 = false; 4704 if (ST.hasR128A16()) { 4705 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); 4706 IsA16 = R128A16->getImm() != 0; 4707 } else if (ST.hasA16()) { 4708 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); 4709 IsA16 = A16->getImm() != 0; 4710 } 4711 4712 bool IsNSA = SRsrcIdx - VAddr0Idx > 1; 4713 4714 unsigned AddrWords = 4715 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16()); 4716 4717 unsigned VAddrWords; 4718 if (IsNSA) { 4719 VAddrWords = SRsrcIdx - VAddr0Idx; 4720 if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) { 4721 unsigned LastVAddrIdx = SRsrcIdx - 1; 4722 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1; 4723 } 4724 } else { 4725 VAddrWords = getOpSize(MI, VAddr0Idx) / 4; 4726 if (AddrWords > 12) 4727 AddrWords = 16; 4728 } 4729 4730 if (VAddrWords != AddrWords) { 4731 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords 4732 << " but got " << VAddrWords << "\n"); 4733 ErrInfo = "bad vaddr size"; 4734 return false; 4735 } 4736 } 4737 } 4738 4739 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); 4740 if (DppCt) { 4741 using namespace AMDGPU::DPP; 4742 4743 unsigned DC = DppCt->getImm(); 4744 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || 4745 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || 4746 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || 4747 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || 4748 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || 4749 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) || 4750 (DC >= 
DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) { 4751 ErrInfo = "Invalid dpp_ctrl value"; 4752 return false; 4753 } 4754 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 && 4755 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4756 ErrInfo = "Invalid dpp_ctrl value: " 4757 "wavefront shifts are not supported on GFX10+"; 4758 return false; 4759 } 4760 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 && 4761 ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 4762 ErrInfo = "Invalid dpp_ctrl value: " 4763 "broadcasts are not supported on GFX10+"; 4764 return false; 4765 } 4766 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && 4767 ST.getGeneration() < AMDGPUSubtarget::GFX10) { 4768 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && 4769 DC <= DppCtrl::ROW_NEWBCAST_LAST && 4770 !ST.hasGFX90AInsts()) { 4771 ErrInfo = "Invalid dpp_ctrl value: " 4772 "row_newbroadcast/row_share is not supported before " 4773 "GFX90A/GFX10"; 4774 return false; 4775 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { 4776 ErrInfo = "Invalid dpp_ctrl value: " 4777 "row_share and row_xmask are not supported before GFX10"; 4778 return false; 4779 } 4780 } 4781 4782 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); 4783 4784 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && 4785 ((DstIdx >= 0 && 4786 (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || 4787 Desc.operands()[DstIdx].RegClass == 4788 AMDGPU::VReg_64_Align2RegClassID)) || 4789 ((Src0Idx >= 0 && 4790 (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || 4791 Desc.operands()[Src0Idx].RegClass == 4792 AMDGPU::VReg_64_Align2RegClassID)))) && 4793 !AMDGPU::isLegal64BitDPPControl(DC)) { 4794 ErrInfo = "Invalid dpp_ctrl value: " 4795 "64 bit dpp only support row_newbcast"; 4796 return false; 4797 } 4798 } 4799 4800 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { 4801 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); 4802 uint16_t DataNameIdx = isDS(Opcode) ? 
AMDGPU::OpName::data0 4803 : AMDGPU::OpName::vdata; 4804 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); 4805 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); 4806 if (Data && !Data->isReg()) 4807 Data = nullptr; 4808 4809 if (ST.hasGFX90AInsts()) { 4810 if (Dst && Data && 4811 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { 4812 ErrInfo = "Invalid register class: " 4813 "vdata and vdst should be both VGPR or AGPR"; 4814 return false; 4815 } 4816 if (Data && Data2 && 4817 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { 4818 ErrInfo = "Invalid register class: " 4819 "both data operands should be VGPR or AGPR"; 4820 return false; 4821 } 4822 } else { 4823 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || 4824 (Data && RI.isAGPR(MRI, Data->getReg())) || 4825 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { 4826 ErrInfo = "Invalid register class: " 4827 "agpr loads and stores not supported on this GPU"; 4828 return false; 4829 } 4830 } 4831 } 4832 4833 if (ST.needsAlignedVGPRs()) { 4834 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool { 4835 const MachineOperand *Op = getNamedOperand(MI, OpName); 4836 if (!Op) 4837 return true; 4838 Register Reg = Op->getReg(); 4839 if (Reg.isPhysical()) 4840 return !(RI.getHWRegIndex(Reg) & 1); 4841 const TargetRegisterClass &RC = *MRI.getRegClass(Reg); 4842 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) && 4843 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1); 4844 }; 4845 4846 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT || 4847 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR || 4848 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) { 4849 4850 if (!isAlignedReg(AMDGPU::OpName::data0)) { 4851 ErrInfo = "Subtarget requires even aligned vector registers " 4852 "for DS_GWS instructions"; 4853 return false; 4854 } 4855 } 4856 4857 if (isMIMG(MI)) { 4858 if (!isAlignedReg(AMDGPU::OpName::vaddr)) { 4859 ErrInfo = "Subtarget requires even aligned vector registers " 4860 "for vaddr operand of image instructions"; 4861 return false; 4862 } 4863 } 4864 } 4865 4866 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && 4867 !ST.hasGFX90AInsts()) { 4868 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0); 4869 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) { 4870 ErrInfo = "Invalid register class: " 4871 "v_accvgpr_write with an SGPR is not supported on this GPU"; 4872 return false; 4873 } 4874 } 4875 4876 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) { 4877 const MachineOperand &SrcOp = MI.getOperand(1); 4878 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) { 4879 ErrInfo = "pseudo expects only physical SGPRs"; 4880 return false; 4881 } 4882 } 4883 4884 return true; 4885 } 4886 4887 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { 4888 switch (MI.getOpcode()) { 4889 default: return AMDGPU::INSTRUCTION_LIST_END; 4890 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; 4891 case AMDGPU::COPY: return AMDGPU::COPY; 4892 case AMDGPU::PHI: return AMDGPU::PHI; 4893 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; 4894 case AMDGPU::WQM: return AMDGPU::WQM; 4895 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; 4896 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; 4897 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; 4898 case AMDGPU::S_MOV_B32: { 4899 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 4900 return MI.getOperand(1).isReg() || 4901 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? 
4902 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; 4903 } 4904 case AMDGPU::S_ADD_I32: 4905 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; 4906 case AMDGPU::S_ADDC_U32: 4907 return AMDGPU::V_ADDC_U32_e32; 4908 case AMDGPU::S_SUB_I32: 4909 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32; 4910 // FIXME: These are not consistently handled, and selected when the carry is 4911 // used. 4912 case AMDGPU::S_ADD_U32: 4913 return AMDGPU::V_ADD_CO_U32_e32; 4914 case AMDGPU::S_SUB_U32: 4915 return AMDGPU::V_SUB_CO_U32_e32; 4916 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; 4917 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64; 4918 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64; 4919 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64; 4920 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; 4921 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; 4922 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; 4923 case AMDGPU::S_XNOR_B32: 4924 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; 4925 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; 4926 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; 4927 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; 4928 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; 4929 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; 4930 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64; 4931 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; 4932 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64; 4933 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; 4934 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64; 4935 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64; 4936 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64; 4937 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64; 4938 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64; 4939 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64; 4940 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; 4941 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; 4942 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; 4943 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64; 4944 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64; 4945 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64; 4946 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64; 4947 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64; 4948 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64; 4949 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64; 4950 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64; 4951 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64; 4952 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64; 4953 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64; 4954 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64; 4955 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64; 4956 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64; 4957 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; 4958 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; 4959 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; 4960 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; 4961 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; 4962 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; 4963 } 4964 llvm_unreachable( 4965 "Unexpected 
scalar opcode without corresponding vector one!"); 4966 } 4967 4968 void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, 4969 MachineBasicBlock &MBB, 4970 MachineBasicBlock::iterator MBBI, 4971 const DebugLoc &DL, Register Reg, 4972 bool IsSCCLive) const { 4973 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4974 const SIInstrInfo *TII = ST.getInstrInfo(); 4975 bool IsWave32 = ST.isWave32(); 4976 if (IsSCCLive) { 4977 // Insert two move instructions, one to save the original value of EXEC and 4978 // the other to turn on all bits in EXEC. This is required as we can't use 4979 // the single instruction S_OR_SAVEEXEC that clobbers SCC. 4980 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4981 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4982 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); 4983 BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); 4984 } else { 4985 const unsigned OrSaveExec = 4986 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; 4987 auto SaveExec = 4988 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); 4989 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. 4990 } 4991 } 4992 4993 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, 4994 MachineBasicBlock::iterator MBBI, 4995 const DebugLoc &DL, Register Reg) const { 4996 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 4997 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 4998 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); 4999 } 5000 5001 static const TargetRegisterClass * 5002 adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, 5003 const MachineRegisterInfo &MRI, 5004 const MCInstrDesc &TID, unsigned RCID, 5005 bool IsAllocatable) { 5006 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 5007 (((TID.mayLoad() || TID.mayStore()) && 5008 !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || 5009 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { 5010 switch (RCID) { 5011 case AMDGPU::AV_32RegClassID: 5012 RCID = AMDGPU::VGPR_32RegClassID; 5013 break; 5014 case AMDGPU::AV_64RegClassID: 5015 RCID = AMDGPU::VReg_64RegClassID; 5016 break; 5017 case AMDGPU::AV_96RegClassID: 5018 RCID = AMDGPU::VReg_96RegClassID; 5019 break; 5020 case AMDGPU::AV_128RegClassID: 5021 RCID = AMDGPU::VReg_128RegClassID; 5022 break; 5023 case AMDGPU::AV_160RegClassID: 5024 RCID = AMDGPU::VReg_160RegClassID; 5025 break; 5026 case AMDGPU::AV_512RegClassID: 5027 RCID = AMDGPU::VReg_512RegClassID; 5028 break; 5029 default: 5030 break; 5031 } 5032 } 5033 5034 return RI.getProperlyAlignedRC(RI.getRegClass(RCID)); 5035 } 5036 5037 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, 5038 unsigned OpNum, const TargetRegisterInfo *TRI, 5039 const MachineFunction &MF) 5040 const { 5041 if (OpNum >= TID.getNumOperands()) 5042 return nullptr; 5043 auto RegClass = TID.operands()[OpNum].RegClass; 5044 bool IsAllocatable = false; 5045 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { 5046 // vdst and vdata should both be VGPR or AGPR, same for the DS instructions 5047 // with two data operands. Request a register class constrained to VGPR only 5048 // if both operands are present, as Machine Copy Propagation cannot check this 5049 // constraint, and possibly other passes cannot either. 
5050 // 5051 // The check is limited to FLAT and DS because atomics in non-flat encoding 5052 // have their vdst and vdata tied to be the same register. 5053 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 5054 AMDGPU::OpName::vdst); 5055 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, 5056 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 5057 : AMDGPU::OpName::vdata); 5058 if (DataIdx != -1) { 5059 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand( 5060 TID.Opcode, AMDGPU::OpName::data1); 5061 } 5062 } 5063 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass, 5064 IsAllocatable); 5065 } 5066 5067 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, 5068 unsigned OpNo) const { 5069 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 5070 const MCInstrDesc &Desc = get(MI.getOpcode()); 5071 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() || 5072 Desc.operands()[OpNo].RegClass == -1) { 5073 Register Reg = MI.getOperand(OpNo).getReg(); 5074 5075 if (Reg.isVirtual()) 5076 return MRI.getRegClass(Reg); 5077 return RI.getPhysRegBaseClass(Reg); 5078 } 5079 5080 unsigned RCID = Desc.operands()[OpNo].RegClass; 5081 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true); 5082 } 5083 5084 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { 5085 MachineBasicBlock::iterator I = MI; 5086 MachineBasicBlock *MBB = MI.getParent(); 5087 MachineOperand &MO = MI.getOperand(OpIdx); 5088 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 5089 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; 5090 const TargetRegisterClass *RC = RI.getRegClass(RCID); 5091 unsigned Size = RI.getRegSizeInBits(*RC); 5092 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; 5093 if (MO.isReg()) 5094 Opcode = AMDGPU::COPY; 5095 else if (RI.isSGPRClass(RC)) 5096 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; 5097 5098 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); 5099 Register Reg = MRI.createVirtualRegister(VRC); 5100 DebugLoc DL = MBB->findDebugLoc(I); 5101 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); 5102 MO.ChangeToRegister(Reg, false); 5103 } 5104 5105 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, 5106 MachineRegisterInfo &MRI, 5107 MachineOperand &SuperReg, 5108 const TargetRegisterClass *SuperRC, 5109 unsigned SubIdx, 5110 const TargetRegisterClass *SubRC) 5111 const { 5112 MachineBasicBlock *MBB = MI->getParent(); 5113 DebugLoc DL = MI->getDebugLoc(); 5114 Register SubReg = MRI.createVirtualRegister(SubRC); 5115 5116 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { 5117 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 5118 .addReg(SuperReg.getReg(), 0, SubIdx); 5119 return SubReg; 5120 } 5121 5122 // Just in case the super register is itself a sub-register, copy it to a new 5123 // value so we don't need to worry about merging its subreg index with the 5124 // SubIdx passed to this function. The register coalescer should be able to 5125 // eliminate this extra copy. 
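// For illustration (the register names here are placeholders), the sequence built below is roughly: %NewSuper:SuperRC = COPY %Super.<old subreg>, followed by %Sub:SubRC = COPY %NewSuper.<SubIdx>.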
5126 Register NewSuperReg = MRI.createVirtualRegister(SuperRC); 5127 5128 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) 5129 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); 5130 5131 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) 5132 .addReg(NewSuperReg, 0, SubIdx); 5133 5134 return SubReg; 5135 } 5136 5137 MachineOperand SIInstrInfo::buildExtractSubRegOrImm( 5138 MachineBasicBlock::iterator MII, 5139 MachineRegisterInfo &MRI, 5140 MachineOperand &Op, 5141 const TargetRegisterClass *SuperRC, 5142 unsigned SubIdx, 5143 const TargetRegisterClass *SubRC) const { 5144 if (Op.isImm()) { 5145 if (SubIdx == AMDGPU::sub0) 5146 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); 5147 if (SubIdx == AMDGPU::sub1) 5148 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); 5149 5150 llvm_unreachable("Unhandled register index for immediate"); 5151 } 5152 5153 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC, 5154 SubIdx, SubRC); 5155 return MachineOperand::CreateReg(SubReg, false); 5156 } 5157 5158 // Change the order of operands from (0, 1, 2) to (0, 2, 1) 5159 void SIInstrInfo::swapOperands(MachineInstr &Inst) const { 5160 assert(Inst.getNumExplicitOperands() == 3); 5161 MachineOperand Op1 = Inst.getOperand(1); 5162 Inst.removeOperand(1); 5163 Inst.addOperand(Op1); 5164 } 5165 5166 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, 5167 const MCOperandInfo &OpInfo, 5168 const MachineOperand &MO) const { 5169 if (!MO.isReg()) 5170 return false; 5171 5172 Register Reg = MO.getReg(); 5173 5174 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass); 5175 if (Reg.isPhysical()) 5176 return DRC->contains(Reg); 5177 5178 const TargetRegisterClass *RC = MRI.getRegClass(Reg); 5179 5180 if (MO.getSubReg()) { 5181 const MachineFunction *MF = MO.getParent()->getParent()->getParent(); 5182 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF); 5183 if (!SuperRC) 5184 return false; 5185 5186 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()); 5187 if (!DRC) 5188 return false; 5189 } 5190 return RC->hasSuperClassEq(DRC); 5191 } 5192 5193 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, 5194 const MCOperandInfo &OpInfo, 5195 const MachineOperand &MO) const { 5196 if (MO.isReg()) 5197 return isLegalRegOperand(MRI, OpInfo, MO); 5198 5199 // Handle non-register types that are treated like immediates. 5200 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal()); 5201 return true; 5202 } 5203 5204 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, 5205 const MachineOperand *MO) const { 5206 const MachineFunction &MF = *MI.getParent()->getParent(); 5207 const MachineRegisterInfo &MRI = MF.getRegInfo(); 5208 const MCInstrDesc &InstDesc = MI.getDesc(); 5209 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx]; 5210 const TargetRegisterClass *DefinedRC = 5211 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; 5212 if (!MO) 5213 MO = &MI.getOperand(OpIdx); 5214 5215 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); 5216 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 
1 : 0; 5217 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { 5218 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--) 5219 return false; 5220 5221 SmallDenseSet<RegSubRegPair> SGPRsUsed; 5222 if (MO->isReg()) 5223 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); 5224 5225 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 5226 if (i == OpIdx) 5227 continue; 5228 const MachineOperand &Op = MI.getOperand(i); 5229 if (Op.isReg()) { 5230 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); 5231 if (!SGPRsUsed.count(SGPR) && 5232 // FIXME: This can access off the end of the operands() array. 5233 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) { 5234 if (--ConstantBusLimit <= 0) 5235 return false; 5236 SGPRsUsed.insert(SGPR); 5237 } 5238 } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 || 5239 (AMDGPU::isSISrcOperand(InstDesc, i) && 5240 !isInlineConstant(Op, InstDesc.operands()[i]))) { 5241 if (!LiteralLimit--) 5242 return false; 5243 if (--ConstantBusLimit <= 0) 5244 return false; 5245 } 5246 } 5247 } 5248 5249 if (MO->isReg()) { 5250 if (!DefinedRC) 5251 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN; 5252 if (!isLegalRegOperand(MRI, OpInfo, *MO)) 5253 return false; 5254 bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); 5255 if (IsAGPR && !ST.hasMAIInsts()) 5256 return false; 5257 unsigned Opc = MI.getOpcode(); 5258 if (IsAGPR && 5259 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && 5260 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc))) 5261 return false; 5262 // Atomics should have both vdst and vdata either vgpr or agpr. 5263 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); 5264 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, 5265 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); 5266 if ((int)OpIdx == VDstIdx && DataIdx != -1 && 5267 MI.getOperand(DataIdx).isReg() && 5268 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) 5269 return false; 5270 if ((int)OpIdx == DataIdx) { 5271 if (VDstIdx != -1 && 5272 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) 5273 return false; 5274 // DS instructions with 2 src operands also must have tied RC. 5275 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, 5276 AMDGPU::OpName::data1); 5277 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && 5278 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) 5279 return false; 5280 } 5281 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() && 5282 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && 5283 RI.isSGPRReg(MRI, MO->getReg())) 5284 return false; 5285 return true; 5286 } 5287 5288 // Handle non-register types that are treated like immediates. 5289 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); 5290 5291 if (!DefinedRC) { 5292 // This operand expects an immediate. 
5293 return true; 5294 } 5295 5296 return isImmOperandLegal(MI, OpIdx, *MO); 5297 } 5298 5299 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, 5300 MachineInstr &MI) const { 5301 unsigned Opc = MI.getOpcode(); 5302 const MCInstrDesc &InstrDesc = get(Opc); 5303 5304 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); 5305 MachineOperand &Src0 = MI.getOperand(Src0Idx); 5306 5307 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); 5308 MachineOperand &Src1 = MI.getOperand(Src1Idx); 5309 5310 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 5311 // we need to only have one constant bus use before GFX10. 5312 bool HasImplicitSGPR = findImplicitSGPRRead(MI); 5313 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() && 5314 RI.isSGPRReg(MRI, Src0.getReg())) 5315 legalizeOpWithMove(MI, Src0Idx); 5316 5317 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for 5318 // both the value to write (src0) and lane select (src1). Fix up non-SGPR 5319 // src0/src1 with V_READFIRSTLANE. 5320 if (Opc == AMDGPU::V_WRITELANE_B32) { 5321 const DebugLoc &DL = MI.getDebugLoc(); 5322 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { 5323 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5324 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5325 .add(Src0); 5326 Src0.ChangeToRegister(Reg, false); 5327 } 5328 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { 5329 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5330 const DebugLoc &DL = MI.getDebugLoc(); 5331 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5332 .add(Src1); 5333 Src1.ChangeToRegister(Reg, false); 5334 } 5335 return; 5336 } 5337 5338 // No VOP2 instructions support AGPRs. 5339 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg())) 5340 legalizeOpWithMove(MI, Src0Idx); 5341 5342 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) 5343 legalizeOpWithMove(MI, Src1Idx); 5344 5345 // VOP2 src0 instructions support all operand types, so we don't need to check 5346 // their legality. If src1 is already legal, we don't need to do anything. 5347 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1)) 5348 return; 5349 5350 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for 5351 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane 5352 // select is uniform. 5353 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() && 5354 RI.isVGPR(MRI, Src1.getReg())) { 5355 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5356 const DebugLoc &DL = MI.getDebugLoc(); 5357 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5358 .add(Src1); 5359 Src1.ChangeToRegister(Reg, false); 5360 return; 5361 } 5362 5363 // We do not use commuteInstruction here because it is too aggressive and will 5364 // commute if it is possible. We only want to commute here if it improves 5365 // legality. This can be called a fairly large number of times so don't waste 5366 // compile time pointlessly swapping and checking legality again. 5367 if (HasImplicitSGPR || !MI.isCommutable()) { 5368 legalizeOpWithMove(MI, Src1Idx); 5369 return; 5370 } 5371 5372 // If src0 can be used as src1, commuting will make the operands legal. 5373 // Otherwise we have to give up and insert a move. 
5374 // 5375 // TODO: Other immediate-like operand kinds could be commuted if there was a 5376 // MachineOperand::ChangeTo* for them. 5377 if ((!Src1.isImm() && !Src1.isReg()) || 5378 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) { 5379 legalizeOpWithMove(MI, Src1Idx); 5380 return; 5381 } 5382 5383 int CommutedOpc = commuteOpcode(MI); 5384 if (CommutedOpc == -1) { 5385 legalizeOpWithMove(MI, Src1Idx); 5386 return; 5387 } 5388 5389 MI.setDesc(get(CommutedOpc)); 5390 5391 Register Src0Reg = Src0.getReg(); 5392 unsigned Src0SubReg = Src0.getSubReg(); 5393 bool Src0Kill = Src0.isKill(); 5394 5395 if (Src1.isImm()) 5396 Src0.ChangeToImmediate(Src1.getImm()); 5397 else if (Src1.isReg()) { 5398 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); 5399 Src0.setSubReg(Src1.getSubReg()); 5400 } else 5401 llvm_unreachable("Should only have register or immediate operands"); 5402 5403 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); 5404 Src1.setSubReg(Src0SubReg); 5405 fixImplicitOperands(MI); 5406 } 5407 5408 // Legalize VOP3 operands. All operand types are supported for any operand 5409 // but only one literal constant and only starting from GFX10. 5410 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, 5411 MachineInstr &MI) const { 5412 unsigned Opc = MI.getOpcode(); 5413 5414 int VOP3Idx[3] = { 5415 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), 5416 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), 5417 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) 5418 }; 5419 5420 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 || 5421 Opc == AMDGPU::V_PERMLANEX16_B32_e64) { 5422 // src1 and src2 must be scalar 5423 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]); 5424 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]); 5425 const DebugLoc &DL = MI.getDebugLoc(); 5426 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) { 5427 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5428 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5429 .add(Src1); 5430 Src1.ChangeToRegister(Reg, false); 5431 } 5432 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) { 5433 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 5434 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) 5435 .add(Src2); 5436 Src2.ChangeToRegister(Reg, false); 5437 } 5438 } 5439 5440 // Find the one SGPR operand we are allowed to use. 5441 int ConstantBusLimit = ST.getConstantBusLimit(Opc); 5442 int LiteralLimit = ST.hasVOP3Literal() ? 
1 : 0; 5443 SmallDenseSet<unsigned> SGPRsUsed; 5444 Register SGPRReg = findUsedSGPR(MI, VOP3Idx); 5445 if (SGPRReg) { 5446 SGPRsUsed.insert(SGPRReg); 5447 --ConstantBusLimit; 5448 } 5449 5450 for (int Idx : VOP3Idx) { 5451 if (Idx == -1) 5452 break; 5453 MachineOperand &MO = MI.getOperand(Idx); 5454 5455 if (!MO.isReg()) { 5456 if (isInlineConstant(MO, get(Opc).operands()[Idx])) 5457 continue; 5458 5459 if (LiteralLimit > 0 && ConstantBusLimit > 0) { 5460 --LiteralLimit; 5461 --ConstantBusLimit; 5462 continue; 5463 } 5464 5465 --LiteralLimit; 5466 --ConstantBusLimit; 5467 legalizeOpWithMove(MI, Idx); 5468 continue; 5469 } 5470 5471 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) && 5472 !isOperandLegal(MI, Idx, &MO)) { 5473 legalizeOpWithMove(MI, Idx); 5474 continue; 5475 } 5476 5477 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg()))) 5478 continue; // VGPRs are legal 5479 5480 // We can use one SGPR in each VOP3 instruction prior to GFX10 5481 // and two starting from GFX10. 5482 if (SGPRsUsed.count(MO.getReg())) 5483 continue; 5484 if (ConstantBusLimit > 0) { 5485 SGPRsUsed.insert(MO.getReg()); 5486 --ConstantBusLimit; 5487 continue; 5488 } 5489 5490 // If we make it this far, then the operand is not legal and we must 5491 // legalize it. 5492 legalizeOpWithMove(MI, Idx); 5493 } 5494 } 5495 5496 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, 5497 MachineRegisterInfo &MRI) const { 5498 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); 5499 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); 5500 Register DstReg = MRI.createVirtualRegister(SRC); 5501 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; 5502 5503 if (RI.hasAGPRs(VRC)) { 5504 VRC = RI.getEquivalentVGPRClass(VRC); 5505 Register NewSrcReg = MRI.createVirtualRegister(VRC); 5506 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5507 get(TargetOpcode::COPY), NewSrcReg) 5508 .addReg(SrcReg); 5509 SrcReg = NewSrcReg; 5510 } 5511 5512 if (SubRegs == 1) { 5513 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5514 get(AMDGPU::V_READFIRSTLANE_B32), DstReg) 5515 .addReg(SrcReg); 5516 return DstReg; 5517 } 5518 5519 SmallVector<Register, 8> SRegs; 5520 for (unsigned i = 0; i < SubRegs; ++i) { 5521 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5522 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5523 get(AMDGPU::V_READFIRSTLANE_B32), SGPR) 5524 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); 5525 SRegs.push_back(SGPR); 5526 } 5527 5528 MachineInstrBuilder MIB = 5529 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), 5530 get(AMDGPU::REG_SEQUENCE), DstReg); 5531 for (unsigned i = 0; i < SubRegs; ++i) { 5532 MIB.addReg(SRegs[i]); 5533 MIB.addImm(RI.getSubRegFromChannel(i)); 5534 } 5535 return DstReg; 5536 } 5537 5538 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, 5539 MachineInstr &MI) const { 5540 5541 // If the pointer is stored in VGPRs, then we need to move it to 5542 // SGPRs using v_readfirstlane. This is safe because we only select 5543 // loads with uniform pointers to SMRD instructions, so we know the 5544 // pointer value is uniform. 
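// Both the base pointer (sbase) and, when present, a register soffset may need this treatment; handle each operand independently below.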
5545 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); 5546 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { 5547 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); 5548 SBase->setReg(SGPR); 5549 } 5550 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset); 5551 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { 5552 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); 5553 SOff->setReg(SGPR); 5554 } 5555 } 5556 5557 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { 5558 unsigned Opc = Inst.getOpcode(); 5559 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 5560 if (OldSAddrIdx < 0) 5561 return false; 5562 5563 assert(isSegmentSpecificFLAT(Inst)); 5564 5565 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); 5566 if (NewOpc < 0) 5567 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc); 5568 if (NewOpc < 0) 5569 return false; 5570 5571 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo(); 5572 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx); 5573 if (RI.isSGPRReg(MRI, SAddr.getReg())) 5574 return false; 5575 5576 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr); 5577 if (NewVAddrIdx < 0) 5578 return false; 5579 5580 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 5581 5582 // Check vaddr, it shall be zero or absent. 5583 MachineInstr *VAddrDef = nullptr; 5584 if (OldVAddrIdx >= 0) { 5585 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); 5586 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); 5587 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || 5588 !VAddrDef->getOperand(1).isImm() || 5589 VAddrDef->getOperand(1).getImm() != 0) 5590 return false; 5591 } 5592 5593 const MCInstrDesc &NewDesc = get(NewOpc); 5594 Inst.setDesc(NewDesc); 5595 5596 // Callers expect iterator to be valid after this call, so modify the 5597 // instruction in place. 5598 if (OldVAddrIdx == NewVAddrIdx) { 5599 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx); 5600 // Clear use list from the old vaddr holding a zero register. 5601 MRI.removeRegOperandFromUseList(&NewVAddr); 5602 MRI.moveOperands(&NewVAddr, &SAddr, 1); 5603 Inst.removeOperand(OldSAddrIdx); 5604 // Update the use list with the pointer we have just moved from vaddr to 5605 // saddr position. Otherwise new vaddr will be missing from the use list. 5606 MRI.removeRegOperandFromUseList(&NewVAddr); 5607 MRI.addRegOperandToUseList(&NewVAddr); 5608 } else { 5609 assert(OldSAddrIdx == NewVAddrIdx); 5610 5611 if (OldVAddrIdx >= 0) { 5612 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc, 5613 AMDGPU::OpName::vdst_in); 5614 5615 // removeOperand doesn't try to fixup tied operand indexes at it goes, so 5616 // it asserts. Untie the operands for now and retie them afterwards. 5617 if (NewVDstIn != -1) { 5618 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in); 5619 Inst.untieRegOperand(OldVDstIn); 5620 } 5621 5622 Inst.removeOperand(OldVAddrIdx); 5623 5624 if (NewVDstIn != -1) { 5625 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst); 5626 Inst.tieOperands(NewVDst, NewVDstIn); 5627 } 5628 } 5629 } 5630 5631 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg())) 5632 VAddrDef->eraseFromParent(); 5633 5634 return true; 5635 } 5636 5637 // FIXME: Remove this when SelectionDAG is obsoleted. 
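// Legalize a FLAT instruction whose saddr operand ended up in a VGPR: prefer
// rewriting to the vaddr-only form via moveFlatAddrToVGPR(), and otherwise
// v_readfirstlane the address into SGPRs.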
5638 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, 5639 MachineInstr &MI) const { 5640 if (!isSegmentSpecificFLAT(MI)) 5641 return; 5642 5643 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence 5644 // thinks they are uniform, so a readfirstlane should be valid. 5645 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); 5646 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) 5647 return; 5648 5649 if (moveFlatAddrToVGPR(MI)) 5650 return; 5651 5652 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); 5653 SAddr->setReg(ToSGPR); 5654 } 5655 5656 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, 5657 MachineBasicBlock::iterator I, 5658 const TargetRegisterClass *DstRC, 5659 MachineOperand &Op, 5660 MachineRegisterInfo &MRI, 5661 const DebugLoc &DL) const { 5662 Register OpReg = Op.getReg(); 5663 unsigned OpSubReg = Op.getSubReg(); 5664 5665 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( 5666 RI.getRegClassForReg(MRI, OpReg), OpSubReg); 5667 5668 // Check if operand is already the correct register class. 5669 if (DstRC == OpRC) 5670 return; 5671 5672 Register DstReg = MRI.createVirtualRegister(DstRC); 5673 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op); 5674 5675 Op.setReg(DstReg); 5676 Op.setSubReg(0); 5677 5678 MachineInstr *Def = MRI.getVRegDef(OpReg); 5679 if (!Def) 5680 return; 5681 5682 // Try to eliminate the copy if it is copying an immediate value. 5683 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) 5684 FoldImmediate(*Copy, *Def, OpReg, &MRI); 5685 5686 bool ImpDef = Def->isImplicitDef(); 5687 while (!ImpDef && Def && Def->isCopy()) { 5688 if (Def->getOperand(1).getReg().isPhysical()) 5689 break; 5690 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg()); 5691 ImpDef = Def && Def->isImplicitDef(); 5692 } 5693 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) && 5694 !ImpDef) 5695 Copy.addReg(AMDGPU::EXEC, RegState::Implicit); 5696 } 5697 5698 // Emit the actual waterfall loop, executing the wrapped instruction for each 5699 // unique value of \p ScalarOps across all lanes. In the best case we execute 1 5700 // iteration, in the worst case we execute 64 (once per lane). 5701 static void emitLoadScalarOpsFromVGPRLoop( 5702 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, 5703 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, 5704 ArrayRef<MachineOperand *> ScalarOps) { 5705 MachineFunction &MF = *OrigBB.getParent(); 5706 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5707 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5708 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5709 unsigned SaveExecOpc = 5710 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 5711 unsigned XorTermOpc = 5712 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 5713 unsigned AndOpc = 5714 ST.isWave32() ? 
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 5715 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5716 5717 MachineBasicBlock::iterator I = LoopBB.begin(); 5718 5719 SmallVector<Register, 8> ReadlanePieces; 5720 Register CondReg; 5721 5722 for (MachineOperand *ScalarOp : ScalarOps) { 5723 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); 5724 unsigned NumSubRegs = RegSize / 32; 5725 Register VScalarOp = ScalarOp->getReg(); 5726 5727 if (NumSubRegs == 1) { 5728 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5729 5730 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg) 5731 .addReg(VScalarOp); 5732 5733 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 5734 5735 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg) 5736 .addReg(CurReg) 5737 .addReg(VScalarOp); 5738 5739 // Combine the comparison results with AND. 5740 if (!CondReg) // First. 5741 CondReg = NewCondReg; 5742 else { // If not the first, we create an AND. 5743 Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 5744 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 5745 .addReg(CondReg) 5746 .addReg(NewCondReg); 5747 CondReg = AndReg; 5748 } 5749 5750 // Update ScalarOp operand to use the SGPR ScalarOp. 5751 ScalarOp->setReg(CurReg); 5752 ScalarOp->setIsKill(); 5753 } else { 5754 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); 5755 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && 5756 "Unhandled register size"); 5757 5758 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { 5759 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5760 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5761 5762 // Read the next variant <- also loop target. 5763 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) 5764 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx)); 5765 5766 // Read the next variant <- also loop target. 5767 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) 5768 .addReg(VScalarOp, VScalarOpUndef, 5769 TRI->getSubRegFromChannel(Idx + 1)); 5770 5771 ReadlanePieces.push_back(CurRegLo); 5772 ReadlanePieces.push_back(CurRegHi); 5773 5774 // Comparison is to be done as 64-bit. 5775 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 5776 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) 5777 .addReg(CurRegLo) 5778 .addImm(AMDGPU::sub0) 5779 .addReg(CurRegHi) 5780 .addImm(AMDGPU::sub1); 5781 5782 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); 5783 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), 5784 NewCondReg) 5785 .addReg(CurReg); 5786 if (NumSubRegs <= 2) 5787 Cmp.addReg(VScalarOp); 5788 else 5789 Cmp.addReg(VScalarOp, VScalarOpUndef, 5790 TRI->getSubRegFromChannel(Idx, 2)); 5791 5792 // Combine the comparison results with AND. 5793 if (!CondReg) // First. 5794 CondReg = NewCondReg; 5795 else { // If not the first, we create an AND. 5796 Register AndReg = MRI.createVirtualRegister(BoolXExecRC); 5797 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) 5798 .addReg(CondReg) 5799 .addReg(NewCondReg); 5800 CondReg = AndReg; 5801 } 5802 } // End for loop. 5803 5804 auto SScalarOpRC = 5805 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp)); 5806 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC); 5807 5808 // Build scalar ScalarOp. 
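// Assemble the full-width scalar value from the 32-bit readfirstlane pieces
// with a REG_SEQUENCE (sub0, sub1, ...).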
5809 auto Merge = 5810 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp); 5811 unsigned Channel = 0; 5812 for (Register Piece : ReadlanePieces) { 5813 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++)); 5814 } 5815 5816 // Update ScalarOp operand to use the SGPR ScalarOp. 5817 ScalarOp->setReg(SScalarOp); 5818 ScalarOp->setIsKill(); 5819 } 5820 } 5821 5822 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5823 MRI.setSimpleHint(SaveExec, CondReg); 5824 5825 // Update EXEC to matching lanes, saving original to SaveExec. 5826 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec) 5827 .addReg(CondReg, RegState::Kill); 5828 5829 // The original instruction is here; we insert the terminators after it. 5830 I = BodyBB.end(); 5831 5832 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 5833 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) 5834 .addReg(Exec) 5835 .addReg(SaveExec); 5836 5837 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); 5838 } 5839 5840 // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register 5841 // with SGPRs by iterating over all unique values across all lanes. 5842 // Returns the loop basic block that now contains \p MI. 5843 static MachineBasicBlock * 5844 loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, 5845 ArrayRef<MachineOperand *> ScalarOps, 5846 MachineDominatorTree *MDT, 5847 MachineBasicBlock::iterator Begin = nullptr, 5848 MachineBasicBlock::iterator End = nullptr) { 5849 MachineBasicBlock &MBB = *MI.getParent(); 5850 MachineFunction &MF = *MBB.getParent(); 5851 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 5852 const SIRegisterInfo *TRI = ST.getRegisterInfo(); 5853 MachineRegisterInfo &MRI = MF.getRegInfo(); 5854 if (!Begin.isValid()) 5855 Begin = &MI; 5856 if (!End.isValid()) { 5857 End = &MI; 5858 ++End; 5859 } 5860 const DebugLoc &DL = MI.getDebugLoc(); 5861 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 5862 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; 5863 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 5864 5865 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); 5866 5867 // Save the EXEC mask 5868 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec); 5869 5870 // Killed uses in the instruction we are waterfalling around will be 5871 // incorrect due to the added control-flow. 5872 MachineBasicBlock::iterator AfterMI = MI; 5873 ++AfterMI; 5874 for (auto I = Begin; I != AfterMI; I++) { 5875 for (auto &MO : I->all_uses()) 5876 MRI.clearKillFlags(MO.getReg()); 5877 } 5878 5879 // To insert the loop we need to split the block. Move everything after this 5880 // point to a new block, and insert a new empty block between the two. 5881 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); 5882 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock(); 5883 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); 5884 MachineFunction::iterator MBBI(MBB); 5885 ++MBBI; 5886 5887 MF.insert(MBBI, LoopBB); 5888 MF.insert(MBBI, BodyBB); 5889 MF.insert(MBBI, RemainderBB); 5890 5891 LoopBB->addSuccessor(BodyBB); 5892 BodyBB->addSuccessor(LoopBB); 5893 BodyBB->addSuccessor(RemainderBB); 5894 5895 // Move Begin to MI to the BodyBB, and the remainder of the block to 5896 // RemainderBB. 
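// After the split, control flow is roughly:
//   MBB -> LoopBB <-> BodyBB -> RemainderBB
// LoopBB readfirstlanes the scalar operands and narrows EXEC to the matching
// lanes; BodyBB holds the original instruction(s) plus the terminators that
// either branch back to LoopBB or fall through to RemainderBB.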
5897 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 5898 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end()); 5899 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end()); 5900 5901 MBB.addSuccessor(LoopBB); 5902 5903 // Update dominators. We know that MBB immediately dominates LoopBB, that 5904 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates 5905 // RemainderBB. RemainderBB immediately dominates all of the successors 5906 // transferred to it from MBB that MBB used to properly dominate. 5907 if (MDT) { 5908 MDT->addNewBlock(LoopBB, &MBB); 5909 MDT->addNewBlock(BodyBB, LoopBB); 5910 MDT->addNewBlock(RemainderBB, BodyBB); 5911 for (auto &Succ : RemainderBB->successors()) { 5912 if (MDT->properlyDominates(&MBB, Succ)) { 5913 MDT->changeImmediateDominator(Succ, RemainderBB); 5914 } 5915 } 5916 } 5917 5918 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); 5919 5920 // Restore the EXEC mask 5921 MachineBasicBlock::iterator First = RemainderBB->begin(); 5922 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); 5923 return BodyBB; 5924 } 5925 5926 // Extract pointer from Rsrc and return a zero-value Rsrc replacement. 5927 static std::tuple<unsigned, unsigned> 5928 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { 5929 MachineBasicBlock &MBB = *MI.getParent(); 5930 MachineFunction &MF = *MBB.getParent(); 5931 MachineRegisterInfo &MRI = MF.getRegInfo(); 5932 5933 // Extract the ptr from the resource descriptor. 5934 unsigned RsrcPtr = 5935 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, 5936 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); 5937 5938 // Create an empty resource descriptor 5939 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 5940 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5941 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 5942 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass); 5943 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); 5944 5945 // Zero64 = 0 5946 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) 5947 .addImm(0); 5948 5949 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} 5950 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) 5951 .addImm(RsrcDataFormat & 0xFFFFFFFF); 5952 5953 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} 5954 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) 5955 .addImm(RsrcDataFormat >> 32); 5956 5957 // NewSRsrc = {Zero64, SRsrcFormat} 5958 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) 5959 .addReg(Zero64) 5960 .addImm(AMDGPU::sub0_sub1) 5961 .addReg(SRsrcFormatLo) 5962 .addImm(AMDGPU::sub2) 5963 .addReg(SRsrcFormatHi) 5964 .addImm(AMDGPU::sub3); 5965 5966 return std::tuple(RsrcPtr, NewSRsrc); 5967 } 5968 5969 MachineBasicBlock * 5970 SIInstrInfo::legalizeOperands(MachineInstr &MI, 5971 MachineDominatorTree *MDT) const { 5972 MachineFunction &MF = *MI.getParent()->getParent(); 5973 MachineRegisterInfo &MRI = MF.getRegInfo(); 5974 MachineBasicBlock *CreatedBB = nullptr; 5975 5976 // Legalize VOP2 5977 if (isVOP2(MI) || isVOPC(MI)) { 5978 legalizeOperandsVOP2(MRI, MI); 5979 return CreatedBB; 5980 } 5981 5982 // Legalize VOP3 5983 if (isVOP3(MI)) { 5984 legalizeOperandsVOP3(MRI, MI); 5985 return CreatedBB; 5986 } 5987 5988 // Legalize SMRD 5989 if (isSMRD(MI)) { 5990 legalizeOperandsSMRD(MRI, MI); 5991 
return CreatedBB; 5992 } 5993 5994 // Legalize FLAT 5995 if (isFLAT(MI)) { 5996 legalizeOperandsFLAT(MRI, MI); 5997 return CreatedBB; 5998 } 5999 6000 // Legalize REG_SEQUENCE and PHI 6001 // The register class of the operands must be the same type as the register 6002 // class of the output. 6003 if (MI.getOpcode() == AMDGPU::PHI) { 6004 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; 6005 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { 6006 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual()) 6007 continue; 6008 const TargetRegisterClass *OpRC = 6009 MRI.getRegClass(MI.getOperand(i).getReg()); 6010 if (RI.hasVectorRegisters(OpRC)) { 6011 VRC = OpRC; 6012 } else { 6013 SRC = OpRC; 6014 } 6015 } 6016 6017 // If any of the operands are VGPR registers, then they all must be VGPRs; 6018 // otherwise we will create illegal VGPR->SGPR copies when legalizing 6019 // them. 6020 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { 6021 if (!VRC) { 6022 assert(SRC); 6023 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { 6024 VRC = &AMDGPU::VReg_1RegClass; 6025 } else 6026 VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 6027 ? RI.getEquivalentAGPRClass(SRC) 6028 : RI.getEquivalentVGPRClass(SRC); 6029 } else { 6030 VRC = RI.isAGPRClass(getOpRegClass(MI, 0)) 6031 ? RI.getEquivalentAGPRClass(VRC) 6032 : RI.getEquivalentVGPRClass(VRC); 6033 } 6034 RC = VRC; 6035 } else { 6036 RC = SRC; 6037 } 6038 6039 // Update all the operands so they have the same type. 6040 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 6041 MachineOperand &Op = MI.getOperand(I); 6042 if (!Op.isReg() || !Op.getReg().isVirtual()) 6043 continue; 6044 6045 // MI is a PHI instruction. 6046 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); 6047 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); 6048 6049 // Avoid creating no-op copies with the same src and dst reg class. These 6050 // confuse some of the machine passes. 6051 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); 6052 } 6053 } 6054 6055 // REG_SEQUENCE doesn't really require operand legalization, but if one has a 6056 // VGPR dest type and SGPR sources, insert copies so all operands are 6057 // VGPRs. This seems to help operand folding / the register coalescer. 6058 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { 6059 MachineBasicBlock *MBB = MI.getParent(); 6060 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); 6061 if (RI.hasVGPRs(DstRC)) { 6062 // Update all the operands so they are VGPR register classes. These may 6063 // not be the same register class because REG_SEQUENCE supports mixing 6064 // subregister index types e.g. 
sub0_sub1 + sub2 + sub3 6065 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 6066 MachineOperand &Op = MI.getOperand(I); 6067 if (!Op.isReg() || !Op.getReg().isVirtual()) 6068 continue; 6069 6070 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); 6071 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); 6072 if (VRC == OpRC) 6073 continue; 6074 6075 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); 6076 Op.setIsKill(); 6077 } 6078 } 6079 6080 return CreatedBB; 6081 } 6082 6083 // Legalize INSERT_SUBREG 6084 // src0 must have the same register class as dst 6085 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { 6086 Register Dst = MI.getOperand(0).getReg(); 6087 Register Src0 = MI.getOperand(1).getReg(); 6088 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); 6089 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); 6090 if (DstRC != Src0RC) { 6091 MachineBasicBlock *MBB = MI.getParent(); 6092 MachineOperand &Op = MI.getOperand(1); 6093 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); 6094 } 6095 return CreatedBB; 6096 } 6097 6098 // Legalize SI_INIT_M0 6099 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { 6100 MachineOperand &Src = MI.getOperand(0); 6101 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) 6102 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); 6103 return CreatedBB; 6104 } 6105 6106 // Legalize MIMG and MUBUF/MTBUF for shaders. 6107 // 6108 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via 6109 // scratch memory access. In both cases, the legalization never involves 6110 // conversion to the addr64 form. 6111 if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) && 6112 (isMUBUF(MI) || isMTBUF(MI)))) { 6113 MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); 6114 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) 6115 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); 6116 6117 MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); 6118 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) 6119 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); 6120 6121 return CreatedBB; 6122 } 6123 6124 // Legalize SI_CALL 6125 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { 6126 MachineOperand *Dest = &MI.getOperand(0); 6127 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) { 6128 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and 6129 // following copies, we also need to move copies from and to physical 6130 // registers into the loop block. 6131 unsigned FrameSetupOpcode = getCallFrameSetupOpcode(); 6132 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode(); 6133 6134 // Also move the copies to physical registers into the loop block 6135 MachineBasicBlock &MBB = *MI.getParent(); 6136 MachineBasicBlock::iterator Start(&MI); 6137 while (Start->getOpcode() != FrameSetupOpcode) 6138 --Start; 6139 MachineBasicBlock::iterator End(&MI); 6140 while (End->getOpcode() != FrameDestroyOpcode) 6141 ++End; 6142 // Also include following copies of the return value 6143 ++End; 6144 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && 6145 MI.definesRegister(End->getOperand(1).getReg())) 6146 ++End; 6147 CreatedBB = 6148 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); 6149 } 6150 } 6151 6152 // Legalize MUBUF instructions. 
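// First record whether srsrc and soffset (when present) already live in
// SGPRs; only the illegal ones need the ADDR64 rewrite or waterfall loop
// below.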
6153 bool isSoffsetLegal = true; 6154 int SoffsetIdx = 6155 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset); 6156 if (SoffsetIdx != -1) { 6157 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx); 6158 if (Soffset->isReg() && 6159 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) { 6160 isSoffsetLegal = false; 6161 } 6162 } 6163 6164 bool isRsrcLegal = true; 6165 int RsrcIdx = 6166 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); 6167 if (RsrcIdx != -1) { 6168 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 6169 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) { 6170 isRsrcLegal = false; 6171 } 6172 } 6173 6174 // The operands are legal. 6175 if (isRsrcLegal && isSoffsetLegal) 6176 return CreatedBB; 6177 6178 if (!isRsrcLegal) { 6179 // Legalize a VGPR Rsrc 6180 // 6181 // If the instruction is _ADDR64, we can avoid a waterfall by extracting 6182 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using 6183 // a zero-value SRsrc. 6184 // 6185 // If the instruction is _OFFSET (both idxen and offen disabled), and we 6186 // support ADDR64 instructions, we can convert to ADDR64 and do the same as 6187 // above. 6188 // 6189 // Otherwise we are on non-ADDR64 hardware, and/or we have 6190 // idxen/offen/bothen and we fall back to a waterfall loop. 6191 6192 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); 6193 MachineBasicBlock &MBB = *MI.getParent(); 6194 6195 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 6196 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { 6197 // This is already an ADDR64 instruction so we need to add the pointer 6198 // extracted from the resource descriptor to the current value of VAddr. 6199 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6200 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6201 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 6202 6203 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6204 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC); 6205 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC); 6206 6207 unsigned RsrcPtr, NewSRsrc; 6208 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 6209 6210 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 6211 const DebugLoc &DL = MI.getDebugLoc(); 6212 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo) 6213 .addDef(CondReg0) 6214 .addReg(RsrcPtr, 0, AMDGPU::sub0) 6215 .addReg(VAddr->getReg(), 0, AMDGPU::sub0) 6216 .addImm(0); 6217 6218 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 6219 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi) 6220 .addDef(CondReg1, RegState::Dead) 6221 .addReg(RsrcPtr, 0, AMDGPU::sub1) 6222 .addReg(VAddr->getReg(), 0, AMDGPU::sub1) 6223 .addReg(CondReg0, RegState::Kill) 6224 .addImm(0); 6225 6226 // NewVaddr = {NewVaddrHi, NewVaddrLo} 6227 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) 6228 .addReg(NewVAddrLo) 6229 .addImm(AMDGPU::sub0) 6230 .addReg(NewVAddrHi) 6231 .addImm(AMDGPU::sub1); 6232 6233 VAddr->setReg(NewVAddr); 6234 Rsrc->setReg(NewSRsrc); 6235 } else if (!VAddr && ST.hasAddr64()) { 6236 // This instructions is the _OFFSET variant, so we need to convert it to 6237 // ADDR64. 
assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && 6239 "FIXME: Need to emit flat atomics here"); 6240 6241 unsigned RsrcPtr, NewSRsrc; 6242 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); 6243 6244 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 6245 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); 6246 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); 6247 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 6248 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); 6249 6250 // Atomics with return have an additional tied operand and are 6251 // missing some of the special bits. 6252 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); 6253 MachineInstr *Addr64; 6254 6255 if (!VDataIn) { 6256 // Regular buffer load / store. 6257 MachineInstrBuilder MIB = 6258 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 6259 .add(*VData) 6260 .addReg(NewVAddr) 6261 .addReg(NewSRsrc) 6262 .add(*SOffset) 6263 .add(*Offset); 6264 6265 if (const MachineOperand *CPol = 6266 getNamedOperand(MI, AMDGPU::OpName::cpol)) { 6267 MIB.addImm(CPol->getImm()); 6268 } 6269 6270 if (const MachineOperand *TFE = 6271 getNamedOperand(MI, AMDGPU::OpName::tfe)) { 6272 MIB.addImm(TFE->getImm()); 6273 } 6274 6275 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); 6276 6277 MIB.cloneMemRefs(MI); 6278 Addr64 = MIB; 6279 } else { 6280 // Atomics with return. 6281 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) 6282 .add(*VData) 6283 .add(*VDataIn) 6284 .addReg(NewVAddr) 6285 .addReg(NewSRsrc) 6286 .add(*SOffset) 6287 .add(*Offset) 6288 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol)) 6289 .cloneMemRefs(MI); 6290 } 6291 6292 MI.removeFromParent(); 6293 6294 // NewVaddr = {NewVaddrHi, NewVaddrLo} 6295 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), 6296 NewVAddr) 6297 .addReg(RsrcPtr, 0, AMDGPU::sub0) 6298 .addImm(AMDGPU::sub0) 6299 .addReg(RsrcPtr, 0, AMDGPU::sub1) 6300 .addImm(AMDGPU::sub1); 6301 } else { 6302 // Legalize a VGPR Rsrc and soffset together. 6303 if (!isSoffsetLegal) { 6304 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 6305 CreatedBB = 6306 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT); 6307 return CreatedBB; 6308 } 6309 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); 6310 return CreatedBB; 6311 } 6312 } 6313 6314 // Legalize a VGPR soffset. 6315 if (!isSoffsetLegal) { 6316 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); 6317 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT); 6318 return CreatedBB; 6319 } 6320 return CreatedBB; 6321 } 6322 6323 void SIInstrWorklist::insert(MachineInstr *MI) { 6324 InstrList.insert(MI); 6325 // Add MBUF instructions to deferred list. 6326 int RsrcIdx = 6327 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); 6328 if (RsrcIdx != -1) { 6329 DeferredList.insert(MI); 6330 } 6331 } 6332 6333 bool SIInstrWorklist::isDeferred(MachineInstr *MI) { 6334 return DeferredList.contains(MI); 6335 } 6336 6337 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, 6338 MachineDominatorTree *MDT) const { 6339 6340 while (!Worklist.empty()) { 6341 MachineInstr &Inst = *Worklist.top(); 6342 Worklist.erase_top(); 6343 // Skip MachineInstr in the deferred list. 
6344 if (Worklist.isDeferred(&Inst)) 6345 continue; 6346 moveToVALUImpl(Worklist, MDT, Inst); 6347 } 6348 6349 // Deferred list of instructions will be processed once 6350 // all the MachineInstr in the worklist are done. 6351 for (MachineInstr *Inst : Worklist.getDeferredList()) { 6352 moveToVALUImpl(Worklist, MDT, *Inst); 6353 assert(Worklist.empty() && 6354 "Deferred MachineInstr are not supposed to re-populate worklist"); 6355 } 6356 } 6357 6358 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, 6359 MachineDominatorTree *MDT, 6360 MachineInstr &Inst) const { 6361 6362 MachineBasicBlock *MBB = Inst.getParent(); 6363 if (!MBB) 6364 return; 6365 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 6366 unsigned Opcode = Inst.getOpcode(); 6367 unsigned NewOpcode = getVALUOp(Inst); 6368 // Handle some special cases 6369 switch (Opcode) { 6370 default: 6371 break; 6372 case AMDGPU::S_ADD_U64_PSEUDO: 6373 case AMDGPU::S_SUB_U64_PSEUDO: 6374 splitScalar64BitAddSub(Worklist, Inst, MDT); 6375 Inst.eraseFromParent(); 6376 return; 6377 case AMDGPU::S_ADD_I32: 6378 case AMDGPU::S_SUB_I32: { 6379 // FIXME: The u32 versions currently selected use the carry. 6380 bool Changed; 6381 MachineBasicBlock *CreatedBBTmp = nullptr; 6382 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); 6383 if (Changed) 6384 return; 6385 6386 // Default handling 6387 break; 6388 } 6389 case AMDGPU::S_AND_B64: 6390 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); 6391 Inst.eraseFromParent(); 6392 return; 6393 6394 case AMDGPU::S_OR_B64: 6395 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); 6396 Inst.eraseFromParent(); 6397 return; 6398 6399 case AMDGPU::S_XOR_B64: 6400 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); 6401 Inst.eraseFromParent(); 6402 return; 6403 6404 case AMDGPU::S_NAND_B64: 6405 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); 6406 Inst.eraseFromParent(); 6407 return; 6408 6409 case AMDGPU::S_NOR_B64: 6410 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); 6411 Inst.eraseFromParent(); 6412 return; 6413 6414 case AMDGPU::S_XNOR_B64: 6415 if (ST.hasDLInsts()) 6416 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); 6417 else 6418 splitScalar64BitXnor(Worklist, Inst, MDT); 6419 Inst.eraseFromParent(); 6420 return; 6421 6422 case AMDGPU::S_ANDN2_B64: 6423 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); 6424 Inst.eraseFromParent(); 6425 return; 6426 6427 case AMDGPU::S_ORN2_B64: 6428 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); 6429 Inst.eraseFromParent(); 6430 return; 6431 6432 case AMDGPU::S_BREV_B64: 6433 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); 6434 Inst.eraseFromParent(); 6435 return; 6436 6437 case AMDGPU::S_NOT_B64: 6438 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); 6439 Inst.eraseFromParent(); 6440 return; 6441 6442 case AMDGPU::S_BCNT1_I32_B64: 6443 splitScalar64BitBCNT(Worklist, Inst); 6444 Inst.eraseFromParent(); 6445 return; 6446 6447 case AMDGPU::S_BFE_I64: 6448 splitScalar64BitBFE(Worklist, Inst); 6449 Inst.eraseFromParent(); 6450 return; 6451 6452 case AMDGPU::S_LSHL_B32: 6453 if (ST.hasOnlyRevVALUShifts()) { 6454 NewOpcode = AMDGPU::V_LSHLREV_B32_e64; 6455 swapOperands(Inst); 6456 } 6457 break; 6458 case AMDGPU::S_ASHR_I32: 6459 if (ST.hasOnlyRevVALUShifts()) { 6460 NewOpcode = AMDGPU::V_ASHRREV_I32_e64; 6461 swapOperands(Inst); 6462 } 6463 break; 6464 case AMDGPU::S_LSHR_B32: 6465 
if (ST.hasOnlyRevVALUShifts()) { 6466 NewOpcode = AMDGPU::V_LSHRREV_B32_e64; 6467 swapOperands(Inst); 6468 } 6469 break; 6470 case AMDGPU::S_LSHL_B64: 6471 if (ST.hasOnlyRevVALUShifts()) { 6472 NewOpcode = AMDGPU::V_LSHLREV_B64_e64; 6473 swapOperands(Inst); 6474 } 6475 break; 6476 case AMDGPU::S_ASHR_I64: 6477 if (ST.hasOnlyRevVALUShifts()) { 6478 NewOpcode = AMDGPU::V_ASHRREV_I64_e64; 6479 swapOperands(Inst); 6480 } 6481 break; 6482 case AMDGPU::S_LSHR_B64: 6483 if (ST.hasOnlyRevVALUShifts()) { 6484 NewOpcode = AMDGPU::V_LSHRREV_B64_e64; 6485 swapOperands(Inst); 6486 } 6487 break; 6488 6489 case AMDGPU::S_ABS_I32: 6490 lowerScalarAbs(Worklist, Inst); 6491 Inst.eraseFromParent(); 6492 return; 6493 6494 case AMDGPU::S_CBRANCH_SCC0: 6495 case AMDGPU::S_CBRANCH_SCC1: { 6496 // Clear unused bits of vcc 6497 Register CondReg = Inst.getOperand(1).getReg(); 6498 bool IsSCC = CondReg == AMDGPU::SCC; 6499 Register VCC = RI.getVCC(); 6500 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; 6501 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 6502 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) 6503 .addReg(EXEC) 6504 .addReg(IsSCC ? VCC : CondReg); 6505 Inst.removeOperand(1); 6506 } break; 6507 6508 case AMDGPU::S_BFE_U64: 6509 case AMDGPU::S_BFM_B64: 6510 llvm_unreachable("Moving this op to VALU not implemented"); 6511 6512 case AMDGPU::S_PACK_LL_B32_B16: 6513 case AMDGPU::S_PACK_LH_B32_B16: 6514 case AMDGPU::S_PACK_HL_B32_B16: 6515 case AMDGPU::S_PACK_HH_B32_B16: 6516 movePackToVALU(Worklist, MRI, Inst); 6517 Inst.eraseFromParent(); 6518 return; 6519 6520 case AMDGPU::S_XNOR_B32: 6521 lowerScalarXnor(Worklist, Inst); 6522 Inst.eraseFromParent(); 6523 return; 6524 6525 case AMDGPU::S_NAND_B32: 6526 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); 6527 Inst.eraseFromParent(); 6528 return; 6529 6530 case AMDGPU::S_NOR_B32: 6531 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); 6532 Inst.eraseFromParent(); 6533 return; 6534 6535 case AMDGPU::S_ANDN2_B32: 6536 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); 6537 Inst.eraseFromParent(); 6538 return; 6539 6540 case AMDGPU::S_ORN2_B32: 6541 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); 6542 Inst.eraseFromParent(); 6543 return; 6544 6545 // TODO: remove as soon as everything is ready 6546 // to replace VGPR to SGPR copy with V_READFIRSTLANEs. 6547 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO 6548 // can only be selected from the uniform SDNode. 6549 case AMDGPU::S_ADD_CO_PSEUDO: 6550 case AMDGPU::S_SUB_CO_PSEUDO: { 6551 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) 6552 ? 
AMDGPU::V_ADDC_U32_e64 6553 : AMDGPU::V_SUBB_U32_e64; 6554 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6555 6556 Register CarryInReg = Inst.getOperand(4).getReg(); 6557 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { 6558 Register NewCarryReg = MRI.createVirtualRegister(CarryRC); 6559 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) 6560 .addReg(CarryInReg); 6561 } 6562 6563 Register CarryOutReg = Inst.getOperand(1).getReg(); 6564 6565 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( 6566 MRI.getRegClass(Inst.getOperand(0).getReg()))); 6567 MachineInstr *CarryOp = 6568 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) 6569 .addReg(CarryOutReg, RegState::Define) 6570 .add(Inst.getOperand(2)) 6571 .add(Inst.getOperand(3)) 6572 .addReg(CarryInReg) 6573 .addImm(0); 6574 legalizeOperands(*CarryOp); 6575 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); 6576 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); 6577 Inst.eraseFromParent(); 6578 } 6579 return; 6580 case AMDGPU::S_UADDO_PSEUDO: 6581 case AMDGPU::S_USUBO_PSEUDO: { 6582 const DebugLoc &DL = Inst.getDebugLoc(); 6583 MachineOperand &Dest0 = Inst.getOperand(0); 6584 MachineOperand &Dest1 = Inst.getOperand(1); 6585 MachineOperand &Src0 = Inst.getOperand(2); 6586 MachineOperand &Src1 = Inst.getOperand(3); 6587 6588 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) 6589 ? AMDGPU::V_ADD_CO_U32_e64 6590 : AMDGPU::V_SUB_CO_U32_e64; 6591 const TargetRegisterClass *NewRC = 6592 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); 6593 Register DestReg = MRI.createVirtualRegister(NewRC); 6594 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) 6595 .addReg(Dest1.getReg(), RegState::Define) 6596 .add(Src0) 6597 .add(Src1) 6598 .addImm(0); // clamp bit 6599 6600 legalizeOperands(*NewInstr, MDT); 6601 MRI.replaceRegWith(Dest0.getReg(), DestReg); 6602 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, 6603 Worklist); 6604 Inst.eraseFromParent(); 6605 } 6606 return; 6607 6608 case AMDGPU::S_CSELECT_B32: 6609 case AMDGPU::S_CSELECT_B64: 6610 lowerSelect(Worklist, Inst, MDT); 6611 Inst.eraseFromParent(); 6612 return; 6613 case AMDGPU::S_CMP_EQ_I32: 6614 case AMDGPU::S_CMP_LG_I32: 6615 case AMDGPU::S_CMP_GT_I32: 6616 case AMDGPU::S_CMP_GE_I32: 6617 case AMDGPU::S_CMP_LT_I32: 6618 case AMDGPU::S_CMP_LE_I32: 6619 case AMDGPU::S_CMP_EQ_U32: 6620 case AMDGPU::S_CMP_LG_U32: 6621 case AMDGPU::S_CMP_GT_U32: 6622 case AMDGPU::S_CMP_GE_U32: 6623 case AMDGPU::S_CMP_LT_U32: 6624 case AMDGPU::S_CMP_LE_U32: 6625 case AMDGPU::S_CMP_EQ_U64: 6626 case AMDGPU::S_CMP_LG_U64: { 6627 const MCInstrDesc &NewDesc = get(NewOpcode); 6628 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); 6629 MachineInstr *NewInstr = 6630 BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) 6631 .add(Inst.getOperand(0)) 6632 .add(Inst.getOperand(1)); 6633 legalizeOperands(*NewInstr, MDT); 6634 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); 6635 MachineOperand SCCOp = Inst.getOperand(SCCIdx); 6636 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); 6637 Inst.eraseFromParent(); 6638 } 6639 return; 6640 } 6641 6642 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { 6643 // We cannot move this instruction to the VALU, so we should try to 6644 // legalize its operands instead. 
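    // (getVALUOp signals this by returning AMDGPU::INSTRUCTION_LIST_END,
    // i.e. there is no VALU opcode mapped to this scalar instruction.)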
6645 legalizeOperands(Inst, MDT); 6646 return; 6647 } 6648 // Handle converting generic instructions like COPY-to-SGPR into 6649 // COPY-to-VGPR. 6650 if (NewOpcode == Opcode) { 6651 Register DstReg = Inst.getOperand(0).getReg(); 6652 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); 6653 6654 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && 6655 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { 6656 // Instead of creating a copy where src and dst are the same register 6657 // class, we just replace all uses of dst with src. These kinds of 6658 // copies interfere with the heuristics MachineSink uses to decide 6659 // whether or not to split a critical edge. Since the pass assumes 6660 // that copies will end up as machine instructions and not be 6661 // eliminated. 6662 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); 6663 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); 6664 MRI.clearKillFlags(Inst.getOperand(1).getReg()); 6665 Inst.getOperand(0).setReg(DstReg); 6666 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally 6667 // these are deleted later, but at -O0 it would leave a suspicious 6668 // looking illegal copy of an undef register. 6669 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) 6670 Inst.removeOperand(I); 6671 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); 6672 return; 6673 } 6674 Register NewDstReg = MRI.createVirtualRegister(NewDstRC); 6675 MRI.replaceRegWith(DstReg, NewDstReg); 6676 legalizeOperands(Inst, MDT); 6677 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 6678 return; 6679 } 6680 6681 // Use the new VALU Opcode. 6682 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) 6683 .setMIFlags(Inst.getFlags()); 6684 for (const MachineOperand &Op : Inst.explicit_operands()) 6685 NewInstr->addOperand(Op); 6686 // Remove any references to SCC. Vector instructions can't read from it, and 6687 // We're just about to add the implicit use / defs of VCC, and we don't want 6688 // both. 6689 for (MachineOperand &Op : Inst.implicit_operands()) { 6690 if (Op.getReg() == AMDGPU::SCC) { 6691 // Only propagate through live-def of SCC. 6692 if (Op.isDef() && !Op.isDead()) 6693 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); 6694 if (Op.isUse()) 6695 addSCCDefsToVALUWorklist(NewInstr, Worklist); 6696 } 6697 } 6698 Inst.eraseFromParent(); 6699 Register NewDstReg; 6700 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { 6701 Register DstReg = NewInstr->getOperand(0).getReg(); 6702 assert(DstReg.isVirtual()); 6703 // Update the destination register class. 6704 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr); 6705 assert(NewDstRC); 6706 NewDstReg = MRI.createVirtualRegister(NewDstRC); 6707 MRI.replaceRegWith(DstReg, NewDstReg); 6708 } 6709 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { 6710 // We are converting these to a BFE, so we need to add the missing 6711 // operands for the size and offset. 6712 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; 6713 NewInstr.addImm(0); 6714 NewInstr.addImm(Size); 6715 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { 6716 // The VALU version adds the second operand to the result, so insert an 6717 // extra 0 operand. 
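    // e.g. s_bcnt1_i32_b32 dst, src becomes v_bcnt_u32_b32 dst, src, 0,
    // since the VALU bcnt computes popcount(src0) + src1.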
6718 NewInstr.addImm(0); 6719 } 6720 if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { 6721 const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); 6722 // If we need to move this to VGPRs, we need to unpack the second operand 6723 // back into the 2 separate ones for bit offset and width. 6724 assert(OffsetWidthOp.isImm() && 6725 "Scalar BFE is only implemented for constant width and offset"); 6726 uint32_t Imm = OffsetWidthOp.getImm(); 6727 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 6728 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 6729 NewInstr->removeOperand(2); 6730 NewInstr.addImm(Offset); 6731 NewInstr.addImm(BitWidth); 6732 } 6733 fixImplicitOperands(*NewInstr); 6734 // Legalize the operands 6735 legalizeOperands(*NewInstr, MDT); 6736 if (NewDstReg) 6737 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); 6738 } 6739 6740 // Add/sub require special handling to deal with carry outs. 6741 std::pair<bool, MachineBasicBlock *> 6742 SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, 6743 MachineDominatorTree *MDT) const { 6744 if (ST.hasAddNoCarry()) { 6745 // Assume there is no user of scc since we don't select this in that case. 6746 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant 6747 // is used. 6748 6749 MachineBasicBlock &MBB = *Inst.getParent(); 6750 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6751 6752 Register OldDstReg = Inst.getOperand(0).getReg(); 6753 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6754 6755 unsigned Opc = Inst.getOpcode(); 6756 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); 6757 6758 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? 6759 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; 6760 6761 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); 6762 Inst.removeOperand(3); 6763 6764 Inst.setDesc(get(NewOpc)); 6765 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit 6766 Inst.addImplicitDefUseOperands(*MBB.getParent()); 6767 MRI.replaceRegWith(OldDstReg, ResultReg); 6768 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT); 6769 6770 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6771 return std::pair(true, NewBB); 6772 } 6773 6774 return std::pair(false, nullptr); 6775 } 6776 6777 void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, 6778 MachineDominatorTree *MDT) const { 6779 6780 MachineBasicBlock &MBB = *Inst.getParent(); 6781 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6782 MachineBasicBlock::iterator MII = Inst; 6783 DebugLoc DL = Inst.getDebugLoc(); 6784 6785 MachineOperand &Dest = Inst.getOperand(0); 6786 MachineOperand &Src0 = Inst.getOperand(1); 6787 MachineOperand &Src1 = Inst.getOperand(2); 6788 MachineOperand &Cond = Inst.getOperand(3); 6789 6790 Register SCCSource = Cond.getReg(); 6791 bool IsSCC = (SCCSource == AMDGPU::SCC); 6792 6793 // If this is a trivial select where the condition is effectively not SCC 6794 // (SCCSource is a source of copy to SCC), then the select is semantically 6795 // equivalent to copying SCCSource. Hence, there is no need to create 6796 // V_CNDMASK, we can just use that and bail out. 
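  // Put differently, "select cond, -1, 0" with cond already held in a
  // register is just cond itself, so Dest can simply be replaced with
  // SCCSource.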
6797 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && 6798 (Src1.getImm() == 0)) { 6799 MRI.replaceRegWith(Dest.getReg(), SCCSource); 6800 return; 6801 } 6802 6803 const TargetRegisterClass *TC = 6804 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 6805 6806 Register CopySCC = MRI.createVirtualRegister(TC); 6807 6808 if (IsSCC) { 6809 // Now look for the closest SCC def if it is a copy 6810 // replacing the SCCSource with the COPY source register 6811 bool CopyFound = false; 6812 for (MachineInstr &CandI : 6813 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), 6814 Inst.getParent()->rend())) { 6815 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != 6816 -1) { 6817 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { 6818 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) 6819 .addReg(CandI.getOperand(1).getReg()); 6820 CopyFound = true; 6821 } 6822 break; 6823 } 6824 } 6825 if (!CopyFound) { 6826 // SCC def is not a copy 6827 // Insert a trivial select instead of creating a copy, because a copy from 6828 // SCC would semantically mean just copying a single bit, but we may need 6829 // the result to be a vector condition mask that needs preserving. 6830 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 6831 : AMDGPU::S_CSELECT_B32; 6832 auto NewSelect = 6833 BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); 6834 NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); 6835 } 6836 } 6837 6838 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6839 6840 auto UpdatedInst = 6841 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) 6842 .addImm(0) 6843 .add(Src1) // False 6844 .addImm(0) 6845 .add(Src0) // True 6846 .addReg(IsSCC ? CopySCC : SCCSource); 6847 6848 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6849 legalizeOperands(*UpdatedInst, MDT); 6850 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6851 } 6852 6853 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, 6854 MachineInstr &Inst) const { 6855 MachineBasicBlock &MBB = *Inst.getParent(); 6856 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6857 MachineBasicBlock::iterator MII = Inst; 6858 DebugLoc DL = Inst.getDebugLoc(); 6859 6860 MachineOperand &Dest = Inst.getOperand(0); 6861 MachineOperand &Src = Inst.getOperand(1); 6862 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6863 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6864 6865 unsigned SubOp = ST.hasAddNoCarry() ? 
6866 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32; 6867 6868 BuildMI(MBB, MII, DL, get(SubOp), TmpReg) 6869 .addImm(0) 6870 .addReg(Src.getReg()); 6871 6872 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) 6873 .addReg(Src.getReg()) 6874 .addReg(TmpReg); 6875 6876 MRI.replaceRegWith(Dest.getReg(), ResultReg); 6877 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 6878 } 6879 6880 void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, 6881 MachineInstr &Inst) const { 6882 MachineBasicBlock &MBB = *Inst.getParent(); 6883 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6884 MachineBasicBlock::iterator MII = Inst; 6885 const DebugLoc &DL = Inst.getDebugLoc(); 6886 6887 MachineOperand &Dest = Inst.getOperand(0); 6888 MachineOperand &Src0 = Inst.getOperand(1); 6889 MachineOperand &Src1 = Inst.getOperand(2); 6890 6891 if (ST.hasDLInsts()) { 6892 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 6893 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); 6894 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); 6895 6896 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) 6897 .add(Src0) 6898 .add(Src1); 6899 6900 MRI.replaceRegWith(Dest.getReg(), NewDest); 6901 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6902 } else { 6903 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can 6904 // invert either source and then perform the XOR. If either source is a 6905 // scalar register, then we can leave the inversion on the scalar unit to 6906 // achieve a better distribution of scalar and vector instructions. 6907 bool Src0IsSGPR = Src0.isReg() && 6908 RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); 6909 bool Src1IsSGPR = Src1.isReg() && 6910 RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); 6911 MachineInstr *Xor; 6912 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6913 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6914 6915 // Build a pair of scalar instructions and add them to the work list. 6916 // The next iteration over the work list will lower these to the vector 6917 // unit as necessary. 
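    // e.g. when Src0 is the SGPR operand:
    //   S_NOT_B32 Temp, Src0
    //   S_XOR_B32 NewDest, Temp, Src1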
6918 if (Src0IsSGPR) { 6919 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0); 6920 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6921 .addReg(Temp) 6922 .add(Src1); 6923 } else if (Src1IsSGPR) { 6924 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1); 6925 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) 6926 .add(Src0) 6927 .addReg(Temp); 6928 } else { 6929 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) 6930 .add(Src0) 6931 .add(Src1); 6932 MachineInstr *Not = 6933 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp); 6934 Worklist.insert(Not); 6935 } 6936 6937 MRI.replaceRegWith(Dest.getReg(), NewDest); 6938 6939 Worklist.insert(Xor); 6940 6941 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6942 } 6943 } 6944 6945 void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist, 6946 MachineInstr &Inst, 6947 unsigned Opcode) const { 6948 MachineBasicBlock &MBB = *Inst.getParent(); 6949 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6950 MachineBasicBlock::iterator MII = Inst; 6951 const DebugLoc &DL = Inst.getDebugLoc(); 6952 6953 MachineOperand &Dest = Inst.getOperand(0); 6954 MachineOperand &Src0 = Inst.getOperand(1); 6955 MachineOperand &Src1 = Inst.getOperand(2); 6956 6957 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6958 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 6959 6960 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm) 6961 .add(Src0) 6962 .add(Src1); 6963 6964 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) 6965 .addReg(Interm); 6966 6967 Worklist.insert(&Op); 6968 Worklist.insert(&Not); 6969 6970 MRI.replaceRegWith(Dest.getReg(), NewDest); 6971 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 6972 } 6973 6974 void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist, 6975 MachineInstr &Inst, 6976 unsigned Opcode) const { 6977 MachineBasicBlock &MBB = *Inst.getParent(); 6978 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 6979 MachineBasicBlock::iterator MII = Inst; 6980 const DebugLoc &DL = Inst.getDebugLoc(); 6981 6982 MachineOperand &Dest = Inst.getOperand(0); 6983 MachineOperand &Src0 = Inst.getOperand(1); 6984 MachineOperand &Src1 = Inst.getOperand(2); 6985 6986 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6987 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 6988 6989 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) 6990 .add(Src1); 6991 6992 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) 6993 .add(Src0) 6994 .addReg(Interm); 6995 6996 Worklist.insert(&Not); 6997 Worklist.insert(&Op); 6998 6999 MRI.replaceRegWith(Dest.getReg(), NewDest); 7000 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); 7001 } 7002 7003 void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, 7004 MachineInstr &Inst, unsigned Opcode, 7005 bool Swap) const { 7006 MachineBasicBlock &MBB = *Inst.getParent(); 7007 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7008 7009 MachineOperand &Dest = Inst.getOperand(0); 7010 MachineOperand &Src0 = Inst.getOperand(1); 7011 DebugLoc DL = Inst.getDebugLoc(); 7012 7013 MachineBasicBlock::iterator MII = Inst; 7014 7015 const MCInstrDesc &InstDesc = get(Opcode); 7016 const TargetRegisterClass *Src0RC = Src0.isReg() ? 
7017 MRI.getRegClass(Src0.getReg()) : 7018 &AMDGPU::SGPR_32RegClass; 7019 7020 const TargetRegisterClass *Src0SubRC = 7021 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 7022 7023 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7024 AMDGPU::sub0, Src0SubRC); 7025 7026 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 7027 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 7028 const TargetRegisterClass *NewDestSubRC = 7029 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0); 7030 7031 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 7032 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); 7033 7034 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7035 AMDGPU::sub1, Src0SubRC); 7036 7037 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 7038 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); 7039 7040 if (Swap) 7041 std::swap(DestSub0, DestSub1); 7042 7043 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 7044 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 7045 .addReg(DestSub0) 7046 .addImm(AMDGPU::sub0) 7047 .addReg(DestSub1) 7048 .addImm(AMDGPU::sub1); 7049 7050 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 7051 7052 Worklist.insert(&LoHalf); 7053 Worklist.insert(&HiHalf); 7054 7055 // We don't need to legalizeOperands here because for a single operand, src0 7056 // will support any kind of input. 7057 7058 // Move all users of this moved value. 7059 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 7060 } 7061 7062 void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist, 7063 MachineInstr &Inst, 7064 MachineDominatorTree *MDT) const { 7065 bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); 7066 7067 MachineBasicBlock &MBB = *Inst.getParent(); 7068 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7069 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 7070 7071 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 7072 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7073 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7074 7075 Register CarryReg = MRI.createVirtualRegister(CarryRC); 7076 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); 7077 7078 MachineOperand &Dest = Inst.getOperand(0); 7079 MachineOperand &Src0 = Inst.getOperand(1); 7080 MachineOperand &Src1 = Inst.getOperand(2); 7081 const DebugLoc &DL = Inst.getDebugLoc(); 7082 MachineBasicBlock::iterator MII = Inst; 7083 7084 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); 7085 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); 7086 const TargetRegisterClass *Src0SubRC = 7087 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 7088 const TargetRegisterClass *Src1SubRC = 7089 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); 7090 7091 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7092 AMDGPU::sub0, Src0SubRC); 7093 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 7094 AMDGPU::sub0, Src1SubRC); 7095 7096 7097 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7098 AMDGPU::sub1, Src0SubRC); 7099 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 7100 AMDGPU::sub1, Src1SubRC); 7101 7102 unsigned LoOpc = IsAdd ? 
AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; 7103 MachineInstr *LoHalf = 7104 BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) 7105 .addReg(CarryReg, RegState::Define) 7106 .add(SrcReg0Sub0) 7107 .add(SrcReg1Sub0) 7108 .addImm(0); // clamp bit 7109 7110 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; 7111 MachineInstr *HiHalf = 7112 BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) 7113 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 7114 .add(SrcReg0Sub1) 7115 .add(SrcReg1Sub1) 7116 .addReg(CarryReg, RegState::Kill) 7117 .addImm(0); // clamp bit 7118 7119 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 7120 .addReg(DestSub0) 7121 .addImm(AMDGPU::sub0) 7122 .addReg(DestSub1) 7123 .addImm(AMDGPU::sub1); 7124 7125 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 7126 7127 // Try to legalize the operands in case we need to swap the order to keep it 7128 // valid. 7129 legalizeOperands(*LoHalf, MDT); 7130 legalizeOperands(*HiHalf, MDT); 7131 7132 // Move all users of this moved value. 7133 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 7134 } 7135 7136 void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, 7137 MachineInstr &Inst, unsigned Opcode, 7138 MachineDominatorTree *MDT) const { 7139 MachineBasicBlock &MBB = *Inst.getParent(); 7140 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7141 7142 MachineOperand &Dest = Inst.getOperand(0); 7143 MachineOperand &Src0 = Inst.getOperand(1); 7144 MachineOperand &Src1 = Inst.getOperand(2); 7145 DebugLoc DL = Inst.getDebugLoc(); 7146 7147 MachineBasicBlock::iterator MII = Inst; 7148 7149 const MCInstrDesc &InstDesc = get(Opcode); 7150 const TargetRegisterClass *Src0RC = Src0.isReg() ? 7151 MRI.getRegClass(Src0.getReg()) : 7152 &AMDGPU::SGPR_32RegClass; 7153 7154 const TargetRegisterClass *Src0SubRC = 7155 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); 7156 const TargetRegisterClass *Src1RC = Src1.isReg() ? 
7157 MRI.getRegClass(Src1.getReg()) : 7158 &AMDGPU::SGPR_32RegClass; 7159 7160 const TargetRegisterClass *Src1SubRC = 7161 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); 7162 7163 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7164 AMDGPU::sub0, Src0SubRC); 7165 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 7166 AMDGPU::sub0, Src1SubRC); 7167 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, 7168 AMDGPU::sub1, Src0SubRC); 7169 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, 7170 AMDGPU::sub1, Src1SubRC); 7171 7172 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 7173 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); 7174 const TargetRegisterClass *NewDestSubRC = 7175 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0); 7176 7177 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC); 7178 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) 7179 .add(SrcReg0Sub0) 7180 .add(SrcReg1Sub0); 7181 7182 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC); 7183 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) 7184 .add(SrcReg0Sub1) 7185 .add(SrcReg1Sub1); 7186 7187 Register FullDestReg = MRI.createVirtualRegister(NewDestRC); 7188 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) 7189 .addReg(DestSub0) 7190 .addImm(AMDGPU::sub0) 7191 .addReg(DestSub1) 7192 .addImm(AMDGPU::sub1); 7193 7194 MRI.replaceRegWith(Dest.getReg(), FullDestReg); 7195 7196 Worklist.insert(&LoHalf); 7197 Worklist.insert(&HiHalf); 7198 7199 // Move all users of this moved value. 7200 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); 7201 } 7202 7203 void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist, 7204 MachineInstr &Inst, 7205 MachineDominatorTree *MDT) const { 7206 MachineBasicBlock &MBB = *Inst.getParent(); 7207 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7208 7209 MachineOperand &Dest = Inst.getOperand(0); 7210 MachineOperand &Src0 = Inst.getOperand(1); 7211 MachineOperand &Src1 = Inst.getOperand(2); 7212 const DebugLoc &DL = Inst.getDebugLoc(); 7213 7214 MachineBasicBlock::iterator MII = Inst; 7215 7216 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); 7217 7218 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 7219 7220 MachineOperand* Op0; 7221 MachineOperand* Op1; 7222 7223 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { 7224 Op0 = &Src0; 7225 Op1 = &Src1; 7226 } else { 7227 Op0 = &Src1; 7228 Op1 = &Src0; 7229 } 7230 7231 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) 7232 .add(*Op0); 7233 7234 Register NewDest = MRI.createVirtualRegister(DestRC); 7235 7236 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) 7237 .addReg(Interm) 7238 .add(*Op1); 7239 7240 MRI.replaceRegWith(Dest.getReg(), NewDest); 7241 7242 Worklist.insert(&Xor); 7243 } 7244 7245 void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist, 7246 MachineInstr &Inst) const { 7247 MachineBasicBlock &MBB = *Inst.getParent(); 7248 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7249 7250 MachineBasicBlock::iterator MII = Inst; 7251 const DebugLoc &DL = Inst.getDebugLoc(); 7252 7253 MachineOperand &Dest = Inst.getOperand(0); 7254 MachineOperand &Src = Inst.getOperand(1); 7255 7256 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); 7257 const TargetRegisterClass *SrcRC = Src.isReg() ? 
7258 MRI.getRegClass(Src.getReg()) : 7259 &AMDGPU::SGPR_32RegClass; 7260 7261 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7262 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7263 7264 const TargetRegisterClass *SrcSubRC = 7265 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0); 7266 7267 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 7268 AMDGPU::sub0, SrcSubRC); 7269 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, 7270 AMDGPU::sub1, SrcSubRC); 7271 7272 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0); 7273 7274 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg); 7275 7276 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7277 7278 // We don't need to legalize operands here. src0 for either instruction can be 7279 // an SGPR, and the second input is unused or determined here. 7280 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7281 } 7282 7283 void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, 7284 MachineInstr &Inst) const { 7285 MachineBasicBlock &MBB = *Inst.getParent(); 7286 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7287 MachineBasicBlock::iterator MII = Inst; 7288 const DebugLoc &DL = Inst.getDebugLoc(); 7289 7290 MachineOperand &Dest = Inst.getOperand(0); 7291 uint32_t Imm = Inst.getOperand(2).getImm(); 7292 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. 7293 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 7294 7295 (void) Offset; 7296 7297 // Only sext_inreg cases handled. 7298 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && 7299 Offset == 0 && "Not implemented"); 7300 7301 if (BitWidth < 32) { 7302 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7303 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7304 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 7305 7306 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo) 7307 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) 7308 .addImm(0) 7309 .addImm(BitWidth); 7310 7311 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) 7312 .addImm(31) 7313 .addReg(MidRegLo); 7314 7315 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 7316 .addReg(MidRegLo) 7317 .addImm(AMDGPU::sub0) 7318 .addReg(MidRegHi) 7319 .addImm(AMDGPU::sub1); 7320 7321 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7322 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7323 return; 7324 } 7325 7326 MachineOperand &Src = Inst.getOperand(1); 7327 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7328 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); 7329 7330 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg) 7331 .addImm(31) 7332 .addReg(Src.getReg(), 0, AMDGPU::sub0); 7333 7334 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg) 7335 .addReg(Src.getReg(), 0, AMDGPU::sub0) 7336 .addImm(AMDGPU::sub0) 7337 .addReg(TmpReg) 7338 .addImm(AMDGPU::sub1); 7339 7340 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7341 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7342 } 7343 7344 void SIInstrInfo::addUsersToMoveToVALUWorklist( 7345 Register DstReg, MachineRegisterInfo &MRI, 7346 SIInstrWorklist &Worklist) const { 7347 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), 7348 E = MRI.use_end(); I != E;) { 7349 MachineInstr &UseMI = *I->getParent(); 7350 7351 
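    // For the generic pass-through pseudos below, check the result operand
    // (operand 0); for any other user, check the operand that actually reads
    // DstReg. Only users whose class has no vector registers need moving.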
unsigned OpNo = 0; 7352 7353 switch (UseMI.getOpcode()) { 7354 case AMDGPU::COPY: 7355 case AMDGPU::WQM: 7356 case AMDGPU::SOFT_WQM: 7357 case AMDGPU::STRICT_WWM: 7358 case AMDGPU::STRICT_WQM: 7359 case AMDGPU::REG_SEQUENCE: 7360 case AMDGPU::PHI: 7361 case AMDGPU::INSERT_SUBREG: 7362 break; 7363 default: 7364 OpNo = I.getOperandNo(); 7365 break; 7366 } 7367 7368 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) { 7369 Worklist.insert(&UseMI); 7370 7371 do { 7372 ++I; 7373 } while (I != E && I->getParent() == &UseMI); 7374 } else { 7375 ++I; 7376 } 7377 } 7378 } 7379 7380 void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, 7381 MachineRegisterInfo &MRI, 7382 MachineInstr &Inst) const { 7383 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7384 MachineBasicBlock *MBB = Inst.getParent(); 7385 MachineOperand &Src0 = Inst.getOperand(1); 7386 MachineOperand &Src1 = Inst.getOperand(2); 7387 const DebugLoc &DL = Inst.getDebugLoc(); 7388 7389 switch (Inst.getOpcode()) { 7390 case AMDGPU::S_PACK_LL_B32_B16: { 7391 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7392 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7393 7394 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are 7395 // 0. 7396 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7397 .addImm(0xffff); 7398 7399 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg) 7400 .addReg(ImmReg, RegState::Kill) 7401 .add(Src0); 7402 7403 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 7404 .add(Src1) 7405 .addImm(16) 7406 .addReg(TmpReg, RegState::Kill); 7407 break; 7408 } 7409 case AMDGPU::S_PACK_LH_B32_B16: { 7410 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7411 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7412 .addImm(0xffff); 7413 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg) 7414 .addReg(ImmReg, RegState::Kill) 7415 .add(Src0) 7416 .add(Src1); 7417 break; 7418 } 7419 case AMDGPU::S_PACK_HL_B32_B16: { 7420 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7421 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 7422 .addImm(16) 7423 .add(Src0); 7424 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg) 7425 .add(Src1) 7426 .addImm(16) 7427 .addReg(TmpReg, RegState::Kill); 7428 break; 7429 } 7430 case AMDGPU::S_PACK_HH_B32_B16: { 7431 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7432 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 7433 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) 7434 .addImm(16) 7435 .add(Src0); 7436 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg) 7437 .addImm(0xffff0000); 7438 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg) 7439 .add(Src1) 7440 .addReg(ImmReg, RegState::Kill) 7441 .addReg(TmpReg, RegState::Kill); 7442 break; 7443 } 7444 default: 7445 llvm_unreachable("unhandled s_pack_* instruction"); 7446 } 7447 7448 MachineOperand &Dest = Inst.getOperand(0); 7449 MRI.replaceRegWith(Dest.getReg(), ResultReg); 7450 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); 7451 } 7452 7453 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, 7454 MachineInstr &SCCDefInst, 7455 SIInstrWorklist &Worklist, 7456 Register NewCond) const { 7457 7458 // Ensure that def inst defines SCC, which is still live. 
7459 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && 7460 !Op.isDead() && Op.getParent() == &SCCDefInst); 7461 SmallVector<MachineInstr *, 4> CopyToDelete; 7462 // This assumes that all the users of SCC are in the same block 7463 // as the SCC def. 7464 for (MachineInstr &MI : // Skip the def inst itself. 7465 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), 7466 SCCDefInst.getParent()->end())) { 7467 // Check if SCC is used first. 7468 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI); 7469 if (SCCIdx != -1) { 7470 if (MI.isCopy()) { 7471 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7472 Register DestReg = MI.getOperand(0).getReg(); 7473 7474 MRI.replaceRegWith(DestReg, NewCond); 7475 CopyToDelete.push_back(&MI); 7476 } else { 7477 7478 if (NewCond.isValid()) 7479 MI.getOperand(SCCIdx).setReg(NewCond); 7480 7481 Worklist.insert(&MI); 7482 } 7483 } 7484 // Exit if we find another SCC def. 7485 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) 7486 break; 7487 } 7488 for (auto &Copy : CopyToDelete) 7489 Copy->eraseFromParent(); 7490 } 7491 7492 // Instructions that use SCC may be converted to VALU instructions. When that 7493 // happens, the SCC register is changed to VCC_LO. The instruction that defines 7494 // SCC must be changed to an instruction that defines VCC. This function makes 7495 // sure that the instruction that defines SCC is added to the moveToVALU 7496 // worklist. 7497 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, 7498 SIInstrWorklist &Worklist) const { 7499 // Look for a preceding instruction that either defines VCC or SCC. If VCC 7500 // then there is nothing to do because the defining instruction has been 7501 // converted to a VALU already. If SCC then that instruction needs to be 7502 // converted to a VALU. 7503 for (MachineInstr &MI : 7504 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)), 7505 SCCUseInst->getParent()->rend())) { 7506 if (MI.modifiesRegister(AMDGPU::VCC, &RI)) 7507 break; 7508 if (MI.definesRegister(AMDGPU::SCC, &RI)) { 7509 Worklist.insert(&MI); 7510 break; 7511 } 7512 } 7513 } 7514 7515 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( 7516 const MachineInstr &Inst) const { 7517 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); 7518 7519 switch (Inst.getOpcode()) { 7520 // For target instructions, getOpRegClass just returns the virtual register 7521 // class associated with the operand, so we need to find an equivalent VGPR 7522 // register class in order to move the instruction to the VALU. 
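  // e.g. an SReg_64 result is remapped to VReg_64 below; PHI, REG_SEQUENCE
  // and INSERT_SUBREG results fed by AGPRs keep an equivalent AGPR class
  // instead.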
7523 case AMDGPU::COPY: 7524 case AMDGPU::PHI: 7525 case AMDGPU::REG_SEQUENCE: 7526 case AMDGPU::INSERT_SUBREG: 7527 case AMDGPU::WQM: 7528 case AMDGPU::SOFT_WQM: 7529 case AMDGPU::STRICT_WWM: 7530 case AMDGPU::STRICT_WQM: { 7531 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); 7532 if (RI.isAGPRClass(SrcRC)) { 7533 if (RI.isAGPRClass(NewDstRC)) 7534 return nullptr; 7535 7536 switch (Inst.getOpcode()) { 7537 case AMDGPU::PHI: 7538 case AMDGPU::REG_SEQUENCE: 7539 case AMDGPU::INSERT_SUBREG: 7540 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); 7541 break; 7542 default: 7543 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 7544 } 7545 7546 if (!NewDstRC) 7547 return nullptr; 7548 } else { 7549 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) 7550 return nullptr; 7551 7552 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); 7553 if (!NewDstRC) 7554 return nullptr; 7555 } 7556 7557 return NewDstRC; 7558 } 7559 default: 7560 return NewDstRC; 7561 } 7562 } 7563 7564 // Find the one SGPR operand we are allowed to use. 7565 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, 7566 int OpIndices[3]) const { 7567 const MCInstrDesc &Desc = MI.getDesc(); 7568 7569 // Find the one SGPR operand we are allowed to use. 7570 // 7571 // First we need to consider the instruction's operand requirements before 7572 // legalizing. Some operands are required to be SGPRs, such as implicit uses 7573 // of VCC, but we are still bound by the constant bus requirement to only use 7574 // one. 7575 // 7576 // If the operand's class is an SGPR, we can never move it. 7577 7578 Register SGPRReg = findImplicitSGPRRead(MI); 7579 if (SGPRReg) 7580 return SGPRReg; 7581 7582 Register UsedSGPRs[3] = {Register()}; 7583 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 7584 7585 for (unsigned i = 0; i < 3; ++i) { 7586 int Idx = OpIndices[i]; 7587 if (Idx == -1) 7588 break; 7589 7590 const MachineOperand &MO = MI.getOperand(Idx); 7591 if (!MO.isReg()) 7592 continue; 7593 7594 // Is this operand statically required to be an SGPR based on the operand 7595 // constraints? 7596 const TargetRegisterClass *OpRC = 7597 RI.getRegClass(Desc.operands()[Idx].RegClass); 7598 bool IsRequiredSGPR = RI.isSGPRClass(OpRC); 7599 if (IsRequiredSGPR) 7600 return MO.getReg(); 7601 7602 // If this could be a VGPR or an SGPR, Check the dynamic register class. 7603 Register Reg = MO.getReg(); 7604 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); 7605 if (RI.isSGPRClass(RegRC)) 7606 UsedSGPRs[i] = Reg; 7607 } 7608 7609 // We don't have a required SGPR operand, so we have a bit more freedom in 7610 // selecting operands to move. 7611 7612 // Try to select the most used SGPR. If an SGPR is equal to one of the 7613 // others, we choose that. 7614 // 7615 // e.g. 7616 // V_FMA_F32 v0, s0, s0, s0 -> No moves 7617 // V_FMA_F32 v0, s0, s1, s0 -> Move s1 7618 7619 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should 7620 // prefer those. 
7621 7622 if (UsedSGPRs[0]) { 7623 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) 7624 SGPRReg = UsedSGPRs[0]; 7625 } 7626 7627 if (!SGPRReg && UsedSGPRs[1]) { 7628 if (UsedSGPRs[1] == UsedSGPRs[2]) 7629 SGPRReg = UsedSGPRs[1]; 7630 } 7631 7632 return SGPRReg; 7633 } 7634 7635 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, 7636 unsigned OperandName) const { 7637 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); 7638 if (Idx == -1) 7639 return nullptr; 7640 7641 return &MI.getOperand(Idx); 7642 } 7643 7644 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { 7645 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { 7646 int64_t Format = ST.getGeneration() >= AMDGPUSubtarget::GFX11 7647 ? (int64_t)AMDGPU::UfmtGFX11::UFMT_32_FLOAT 7648 : (int64_t)AMDGPU::UfmtGFX10::UFMT_32_FLOAT; 7649 return (Format << 44) | 7650 (1ULL << 56) | // RESOURCE_LEVEL = 1 7651 (3ULL << 60); // OOB_SELECT = 3 7652 } 7653 7654 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; 7655 if (ST.isAmdHsaOS()) { 7656 // Set ATC = 1. GFX9 doesn't have this bit. 7657 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) 7658 RsrcDataFormat |= (1ULL << 56); 7659 7660 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. 7661 // BTW, it disables TC L2 and therefore decreases performance. 7662 if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) 7663 RsrcDataFormat |= (2ULL << 59); 7664 } 7665 7666 return RsrcDataFormat; 7667 } 7668 7669 uint64_t SIInstrInfo::getScratchRsrcWords23() const { 7670 uint64_t Rsrc23 = getDefaultRsrcDataFormat() | 7671 AMDGPU::RSRC_TID_ENABLE | 7672 0xffffffff; // Size; 7673 7674 // GFX9 doesn't have ELEMENT_SIZE. 7675 if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 7676 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1; 7677 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; 7678 } 7679 7680 // IndexStride = 64 / 32. 7681 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2; 7682 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT; 7683 7684 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. 7685 // Clear them unless we want a huge stride. 
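  // (These are the DATA_FORMAT bits that getDefaultRsrcDataFormat() set
  // above.)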
7686 if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 7687 ST.getGeneration() <= AMDGPUSubtarget::GFX9) 7688 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; 7689 7690 return Rsrc23; 7691 } 7692 7693 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { 7694 unsigned Opc = MI.getOpcode(); 7695 7696 return isSMRD(Opc); 7697 } 7698 7699 bool SIInstrInfo::isHighLatencyDef(int Opc) const { 7700 return get(Opc).mayLoad() && 7701 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); 7702 } 7703 7704 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, 7705 int &FrameIndex) const { 7706 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); 7707 if (!Addr || !Addr->isFI()) 7708 return Register(); 7709 7710 assert(!MI.memoperands_empty() && 7711 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); 7712 7713 FrameIndex = Addr->getIndex(); 7714 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); 7715 } 7716 7717 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, 7718 int &FrameIndex) const { 7719 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); 7720 assert(Addr && Addr->isFI()); 7721 FrameIndex = Addr->getIndex(); 7722 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); 7723 } 7724 7725 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 7726 int &FrameIndex) const { 7727 if (!MI.mayLoad()) 7728 return Register(); 7729 7730 if (isMUBUF(MI) || isVGPRSpill(MI)) 7731 return isStackAccess(MI, FrameIndex); 7732 7733 if (isSGPRSpill(MI)) 7734 return isSGPRStackAccess(MI, FrameIndex); 7735 7736 return Register(); 7737 } 7738 7739 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, 7740 int &FrameIndex) const { 7741 if (!MI.mayStore()) 7742 return Register(); 7743 7744 if (isMUBUF(MI) || isVGPRSpill(MI)) 7745 return isStackAccess(MI, FrameIndex); 7746 7747 if (isSGPRSpill(MI)) 7748 return isSGPRStackAccess(MI, FrameIndex); 7749 7750 return Register(); 7751 } 7752 7753 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { 7754 unsigned Size = 0; 7755 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 7756 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 7757 while (++I != E && I->isInsideBundle()) { 7758 assert(!I->isBundle() && "No nested bundle!"); 7759 Size += getInstSizeInBytes(*I); 7760 } 7761 7762 return Size; 7763 } 7764 7765 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 7766 unsigned Opc = MI.getOpcode(); 7767 const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); 7768 unsigned DescSize = Desc.getSize(); 7769 7770 // If we have a definitive size, we can use it. Otherwise we need to inspect 7771 // the operands to know the size. 7772 if (isFixedSize(MI)) { 7773 unsigned Size = DescSize; 7774 7775 // If we hit the buggy offset, an extra nop will be inserted in MC so 7776 // estimate the worst case. 7777 if (MI.isBranch() && ST.hasOffset3fBug()) 7778 Size += 4; 7779 7780 return Size; 7781 } 7782 7783 // Instructions may have a 32-bit literal encoded after them. Check 7784 // operands that could ever be literals. 
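  // A single 32-bit literal adds 4 bytes to the encoded size, so the result
  // below is either DescSize or DescSize + 4.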
7785 if (isVALU(MI) || isSALU(MI)) { 7786 if (isDPP(MI)) 7787 return DescSize; 7788 bool HasLiteral = false; 7789 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { 7790 const MachineOperand &Op = MI.getOperand(I); 7791 const MCOperandInfo &OpInfo = Desc.operands()[I]; 7792 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) { 7793 HasLiteral = true; 7794 break; 7795 } 7796 } 7797 return HasLiteral ? DescSize + 4 : DescSize; 7798 } 7799 7800 // Check whether we have extra NSA words. 7801 if (isMIMG(MI)) { 7802 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 7803 if (VAddr0Idx < 0) 7804 return 8; 7805 7806 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 7807 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4); 7808 } 7809 7810 switch (Opc) { 7811 case TargetOpcode::BUNDLE: 7812 return getInstBundleSize(MI); 7813 case TargetOpcode::INLINEASM: 7814 case TargetOpcode::INLINEASM_BR: { 7815 const MachineFunction *MF = MI.getParent()->getParent(); 7816 const char *AsmStr = MI.getOperand(0).getSymbolName(); 7817 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST); 7818 } 7819 default: 7820 if (MI.isMetaInstruction()) 7821 return 0; 7822 return DescSize; 7823 } 7824 } 7825 7826 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { 7827 if (!isFLAT(MI)) 7828 return false; 7829 7830 if (MI.memoperands_empty()) 7831 return true; 7832 7833 for (const MachineMemOperand *MMO : MI.memoperands()) { 7834 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 7835 return true; 7836 } 7837 return false; 7838 } 7839 7840 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { 7841 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; 7842 } 7843 7844 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, 7845 MachineBasicBlock *IfEnd) const { 7846 MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); 7847 assert(TI != IfEntry->end()); 7848 7849 MachineInstr *Branch = &(*TI); 7850 MachineFunction *MF = IfEntry->getParent(); 7851 MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); 7852 7853 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7854 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7855 MachineInstr *SIIF = 7856 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) 7857 .add(Branch->getOperand(0)) 7858 .add(Branch->getOperand(1)); 7859 MachineInstr *SIEND = 7860 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) 7861 .addReg(DstReg); 7862 7863 IfEntry->erase(TI); 7864 IfEntry->insert(IfEntry->end(), SIIF); 7865 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); 7866 } 7867 } 7868 7869 void SIInstrInfo::convertNonUniformLoopRegion( 7870 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { 7871 MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); 7872 // We expect 2 terminators, one conditional and one unconditional. 
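  // getFirstTerminator() should therefore give the conditional branch, i.e.
  // the SI_NON_UNIFORM_BRCOND_PSEUDO that is rewritten below.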
7873 assert(TI != LoopEnd->end()); 7874 7875 MachineInstr *Branch = &(*TI); 7876 MachineFunction *MF = LoopEnd->getParent(); 7877 MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); 7878 7879 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { 7880 7881 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC()); 7882 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC()); 7883 MachineInstrBuilder HeaderPHIBuilder = 7884 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); 7885 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) { 7886 if (PMBB == LoopEnd) { 7887 HeaderPHIBuilder.addReg(BackEdgeReg); 7888 } else { 7889 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC()); 7890 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), 7891 ZeroReg, 0); 7892 HeaderPHIBuilder.addReg(ZeroReg); 7893 } 7894 HeaderPHIBuilder.addMBB(PMBB); 7895 } 7896 MachineInstr *HeaderPhi = HeaderPHIBuilder; 7897 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), 7898 get(AMDGPU::SI_IF_BREAK), BackEdgeReg) 7899 .addReg(DstReg) 7900 .add(Branch->getOperand(0)); 7901 MachineInstr *SILOOP = 7902 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) 7903 .addReg(BackEdgeReg) 7904 .addMBB(LoopEntry); 7905 7906 LoopEntry->insert(LoopEntry->begin(), HeaderPhi); 7907 LoopEnd->erase(TI); 7908 LoopEnd->insert(LoopEnd->end(), SIIFBREAK); 7909 LoopEnd->insert(LoopEnd->end(), SILOOP); 7910 } 7911 } 7912 7913 ArrayRef<std::pair<int, const char *>> 7914 SIInstrInfo::getSerializableTargetIndices() const { 7915 static const std::pair<int, const char *> TargetIndices[] = { 7916 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, 7917 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, 7918 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, 7919 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, 7920 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; 7921 return ArrayRef(TargetIndices); 7922 } 7923 7924 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The 7925 /// post-RA version of misched uses CreateTargetMIHazardRecognizer. 7926 ScheduleHazardRecognizer * 7927 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, 7928 const ScheduleDAG *DAG) const { 7929 return new GCNHazardRecognizer(DAG->MF); 7930 } 7931 7932 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer 7933 /// pass. 7934 ScheduleHazardRecognizer * 7935 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { 7936 return new GCNHazardRecognizer(MF); 7937 } 7938 7939 // Called during: 7940 // - pre-RA scheduling and post-RA scheduling 7941 ScheduleHazardRecognizer * 7942 SIInstrInfo::CreateTargetMIHazardRecognizer(const InstrItineraryData *II, 7943 const ScheduleDAGMI *DAG) const { 7944 // Borrowed from Arm Target 7945 // We would like to restrict this hazard recognizer to only 7946 // post-RA scheduling; we can tell that we're post-RA because we don't 7947 // track VRegLiveness. 
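  // When vreg liveness is tracked (pre-RA scheduling), defer to the generic
  // TargetInstrInfo recognizer.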
7948 if (!DAG->hasVRegLiveness()) 7949 return new GCNHazardRecognizer(DAG->MF); 7950 return TargetInstrInfo::CreateTargetMIHazardRecognizer(II, DAG); 7951 } 7952 7953 std::pair<unsigned, unsigned> 7954 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 7955 return std::pair(TF & MO_MASK, TF & ~MO_MASK); 7956 } 7957 7958 ArrayRef<std::pair<unsigned, const char *>> 7959 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 7960 static const std::pair<unsigned, const char *> TargetFlags[] = { 7961 { MO_GOTPCREL, "amdgpu-gotprel" }, 7962 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, 7963 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, 7964 { MO_REL32_LO, "amdgpu-rel32-lo" }, 7965 { MO_REL32_HI, "amdgpu-rel32-hi" }, 7966 { MO_ABS32_LO, "amdgpu-abs32-lo" }, 7967 { MO_ABS32_HI, "amdgpu-abs32-hi" }, 7968 }; 7969 7970 return ArrayRef(TargetFlags); 7971 } 7972 7973 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 7974 SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { 7975 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 7976 { 7977 {MONoClobber, "amdgpu-noclobber"}, 7978 }; 7979 7980 return ArrayRef(TargetFlags); 7981 } 7982 7983 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { 7984 return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && 7985 MI.modifiesRegister(AMDGPU::EXEC, &RI); 7986 } 7987 7988 MachineInstrBuilder 7989 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 7990 MachineBasicBlock::iterator I, 7991 const DebugLoc &DL, 7992 Register DestReg) const { 7993 if (ST.hasAddNoCarry()) 7994 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); 7995 7996 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 7997 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC()); 7998 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC()); 7999 8000 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 8001 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 8002 } 8003 8004 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, 8005 MachineBasicBlock::iterator I, 8006 const DebugLoc &DL, 8007 Register DestReg, 8008 RegScavenger &RS) const { 8009 if (ST.hasAddNoCarry()) 8010 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); 8011 8012 // If available, prefer to use vcc. 8013 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) 8014 ? Register(RI.getVCC()) 8015 : RS.scavengeRegisterBackwards( 8016 *RI.getBoolRC(), I, /* RestoreAfter */ false, 8017 0, /* AllowSpill */ false); 8018 8019 // TODO: Users need to deal with this. 
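  // (If no carry register can be scavenged, a default-constructed, null
  // MachineInstrBuilder is returned, so callers must check the result before
  // adding operands to it.)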
8020 if (!UnusedCarry.isValid()) 8021 return MachineInstrBuilder(); 8022 8023 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg) 8024 .addReg(UnusedCarry, RegState::Define | RegState::Dead); 8025 } 8026 8027 bool SIInstrInfo::isKillTerminator(unsigned Opcode) { 8028 switch (Opcode) { 8029 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: 8030 case AMDGPU::SI_KILL_I1_TERMINATOR: 8031 return true; 8032 default: 8033 return false; 8034 } 8035 } 8036 8037 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { 8038 switch (Opcode) { 8039 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: 8040 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); 8041 case AMDGPU::SI_KILL_I1_PSEUDO: 8042 return get(AMDGPU::SI_KILL_I1_TERMINATOR); 8043 default: 8044 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); 8045 } 8046 } 8047 8048 unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; } 8049 8050 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { 8051 if (!ST.isWave32()) 8052 return; 8053 8054 if (MI.isInlineAsm()) 8055 return; 8056 8057 for (auto &Op : MI.implicit_operands()) { 8058 if (Op.isReg() && Op.getReg() == AMDGPU::VCC) 8059 Op.setReg(AMDGPU::VCC_LO); 8060 } 8061 } 8062 8063 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { 8064 if (!isSMRD(MI)) 8065 return false; 8066 8067 // Check that it is using a buffer resource. 8068 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); 8069 if (Idx == -1) // e.g. s_memtime 8070 return false; 8071 8072 const auto RCID = MI.getDesc().operands()[Idx].RegClass; 8073 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); 8074 } 8075 8076 // Given Imm, split it into the values to put into the SOffset and ImmOffset 8077 // fields in an MUBUF instruction. Return false if it is not possible (due to a 8078 // hardware bug needing a workaround). 8079 // 8080 // The required alignment ensures that individual address components remain 8081 // aligned if they are aligned to begin with. It also ensures that additional 8082 // offsets within the given alignment can be added to the resulting ImmOffset. 8083 bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, 8084 uint32_t &ImmOffset, Align Alignment) const { 8085 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); 8086 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value()); 8087 uint32_t Overflow = 0; 8088 8089 if (Imm > MaxImm) { 8090 if (Imm <= MaxImm + 64) { 8091 // Use an SOffset inline constant for 4..64 8092 Overflow = Imm - MaxImm; 8093 Imm = MaxImm; 8094 } else { 8095 // Try to keep the same value in SOffset for adjacent loads, so that 8096 // the corresponding register contents can be re-used. 8097 // 8098 // Load values with all low-bits (except for alignment bits) set into 8099 // SOffset, so that a larger range of values can be covered using 8100 // s_movk_i32. 8101 // 8102 // Atomic operations fail to work correctly when individual address 8103 // components are unaligned, even if their sum is aligned. 8104 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset; 8105 uint32_t Low = (Imm + Alignment.value()) & MaxOffset; 8106 Imm = Low; 8107 Overflow = High - Alignment.value(); 8108 } 8109 } 8110 8111 // There is a hardware bug in SI and CI which prevents address clamping in 8112 // MUBUF instructions from working correctly with SOffsets. The immediate 8113 // offset is unaffected. 
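  // Illustrative split: with Alignment = 4, MaxImm = alignDown(4095, 4) =
  // 4092, so Imm = 5000 becomes ImmOffset = 908 and SOffset = 4092
  // (908 + 4092 == 5000).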
8114 if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) 8115 return false; 8116 8117 ImmOffset = Imm; 8118 SOffset = Overflow; 8119 return true; 8120 } 8121 8122 // Depending on the used address space and instructions, some immediate offsets 8123 // are allowed and some are not. 8124 // In general, flat instruction offsets can only be non-negative, global and 8125 // scratch instruction offsets can also be negative. 8126 // 8127 // There are several bugs related to these offsets: 8128 // On gfx10.1, flat instructions that go into the global address space cannot 8129 // use an offset. 8130 // 8131 // For scratch instructions, the address can be either an SGPR or a VGPR. 8132 // The following offsets can be used, depending on the architecture (x means 8133 // cannot be used): 8134 // +----------------------------+------+------+ 8135 // | Address-Mode | SGPR | VGPR | 8136 // +----------------------------+------+------+ 8137 // | gfx9 | | | 8138 // | negative, 4-aligned offset | x | ok | 8139 // | negative, unaligned offset | x | ok | 8140 // +----------------------------+------+------+ 8141 // | gfx10 | | | 8142 // | negative, 4-aligned offset | ok | ok | 8143 // | negative, unaligned offset | ok | x | 8144 // +----------------------------+------+------+ 8145 // | gfx10.3 | | | 8146 // | negative, 4-aligned offset | ok | ok | 8147 // | negative, unaligned offset | ok | ok | 8148 // +----------------------------+------+------+ 8149 // 8150 // This function ignores the addressing mode, so if an offset cannot be used in 8151 // one addressing mode, it is considered illegal. 8152 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, 8153 uint64_t FlatVariant) const { 8154 // TODO: Should 0 be special cased? 8155 if (!ST.hasFlatInstOffsets()) 8156 return false; 8157 8158 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT && 8159 (AddrSpace == AMDGPUAS::FLAT_ADDRESS || 8160 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS)) 8161 return false; 8162 8163 bool AllowNegative = FlatVariant != SIInstrFlags::FLAT; 8164 if (ST.hasNegativeScratchOffsetBug() && 8165 FlatVariant == SIInstrFlags::FlatScratch) 8166 AllowNegative = false; 8167 if (ST.hasNegativeUnalignedScratchOffsetBug() && 8168 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 && 8169 (Offset % 4) != 0) { 8170 return false; 8171 } 8172 8173 unsigned N = AMDGPU::getNumFlatOffsetBits(ST); 8174 return isIntN(N, Offset) && (AllowNegative || Offset >= 0); 8175 } 8176 8177 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not. 8178 std::pair<int64_t, int64_t> 8179 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, 8180 uint64_t FlatVariant) const { 8181 int64_t RemainderOffset = COffsetVal; 8182 int64_t ImmField = 0; 8183 bool AllowNegative = FlatVariant != SIInstrFlags::FLAT; 8184 if (ST.hasNegativeScratchOffsetBug() && 8185 FlatVariant == SIInstrFlags::FlatScratch) 8186 AllowNegative = false; 8187 8188 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1; 8189 if (AllowNegative) { 8190 // Use signed division by a power of two to truncate towards 0. 
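// Illustrative example (the bit width is assumed for the sake of the numbers,
// not queried from a real subtarget): if getNumFlatOffsetBits(ST) were 13,
// then NumBits = 12 and D = 4096. For COffsetVal = -5000, C++ signed division
// truncates toward zero, so RemainderOffset = (-5000 / 4096) * 4096 = -4096
// and ImmField = -904, which fits the signed immediate field and keeps
// RemainderOffset + ImmField == COffsetVal.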
8191 int64_t D = 1LL << NumBits; 8192 RemainderOffset = (COffsetVal / D) * D; 8193 ImmField = COffsetVal - RemainderOffset; 8194 8195 if (ST.hasNegativeUnalignedScratchOffsetBug() && 8196 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 && 8197 (ImmField % 4) != 0) { 8198 // Make ImmField a multiple of 4 8199 RemainderOffset += ImmField % 4; 8200 ImmField -= ImmField % 4; 8201 } 8202 } else if (COffsetVal >= 0) { 8203 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits); 8204 RemainderOffset = COffsetVal - ImmField; 8205 } 8206 8207 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant)); 8208 assert(RemainderOffset + ImmField == COffsetVal); 8209 return {ImmField, RemainderOffset}; 8210 } 8211 8212 static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { 8213 switch (ST.getGeneration()) { 8214 default: 8215 break; 8216 case AMDGPUSubtarget::SOUTHERN_ISLANDS: 8217 case AMDGPUSubtarget::SEA_ISLANDS: 8218 return SIEncodingFamily::SI; 8219 case AMDGPUSubtarget::VOLCANIC_ISLANDS: 8220 case AMDGPUSubtarget::GFX9: 8221 return SIEncodingFamily::VI; 8222 case AMDGPUSubtarget::GFX10: 8223 return SIEncodingFamily::GFX10; 8224 case AMDGPUSubtarget::GFX11: 8225 return SIEncodingFamily::GFX11; 8226 } 8227 llvm_unreachable("Unknown subtarget generation!"); 8228 } 8229 8230 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { 8231 switch(MCOp) { 8232 // These opcodes use indirect register addressing so 8233 // they need special handling by codegen (currently missing). 8234 // Therefore it is too risky to allow these opcodes 8235 // to be selected by dpp combiner or sdwa peepholer. 8236 case AMDGPU::V_MOVRELS_B32_dpp_gfx10: 8237 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10: 8238 case AMDGPU::V_MOVRELD_B32_dpp_gfx10: 8239 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10: 8240 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10: 8241 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10: 8242 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10: 8243 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10: 8244 return true; 8245 default: 8246 return false; 8247 } 8248 } 8249 8250 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { 8251 unsigned Gen = subtargetEncodingFamily(ST); 8252 8253 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && 8254 ST.getGeneration() == AMDGPUSubtarget::GFX9) 8255 Gen = SIEncodingFamily::GFX9; 8256 8257 // Adjust the encoding family to GFX80 for D16 buffer instructions when the 8258 // subtarget has UnpackedD16VMem feature. 8259 // TODO: remove this when we discard GFX80 encoding. 8260 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) 8261 Gen = SIEncodingFamily::GFX80; 8262 8263 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) { 8264 switch (ST.getGeneration()) { 8265 default: 8266 Gen = SIEncodingFamily::SDWA; 8267 break; 8268 case AMDGPUSubtarget::GFX9: 8269 Gen = SIEncodingFamily::SDWA9; 8270 break; 8271 case AMDGPUSubtarget::GFX10: 8272 Gen = SIEncodingFamily::SDWA10; 8273 break; 8274 } 8275 } 8276 8277 if (isMAI(Opcode)) { 8278 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode); 8279 if (MFMAOp != -1) 8280 Opcode = MFMAOp; 8281 } 8282 8283 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); 8284 8285 // -1 means that Opcode is already a native instruction. 
8286 if (MCOp == -1) 8287 return Opcode; 8288 8289 if (ST.hasGFX90AInsts()) { 8290 uint16_t NMCOp = (uint16_t)-1; 8291 if (ST.hasGFX940Insts()) 8292 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX940); 8293 if (NMCOp == (uint16_t)-1) 8294 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); 8295 if (NMCOp == (uint16_t)-1) 8296 NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); 8297 if (NMCOp != (uint16_t)-1) 8298 MCOp = NMCOp; 8299 } 8300 8301 // (uint16_t)-1 means that Opcode is a pseudo instruction that has 8302 // no encoding in the given subtarget generation. 8303 if (MCOp == (uint16_t)-1) 8304 return -1; 8305 8306 if (isAsmOnlyOpcode(MCOp)) 8307 return -1; 8308 8309 return MCOp; 8310 } 8311 8312 static 8313 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { 8314 assert(RegOpnd.isReg()); 8315 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() : 8316 getRegSubRegPair(RegOpnd); 8317 } 8318 8319 TargetInstrInfo::RegSubRegPair 8320 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { 8321 assert(MI.isRegSequence()); 8322 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) 8323 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { 8324 auto &RegOp = MI.getOperand(1 + 2 * I); 8325 return getRegOrUndef(RegOp); 8326 } 8327 return TargetInstrInfo::RegSubRegPair(); 8328 } 8329 8330 // Try to find the definition of reg:subreg in subreg-manipulation pseudos 8331 // Following a subreg of reg:subreg isn't supported 8332 static bool followSubRegDef(MachineInstr &MI, 8333 TargetInstrInfo::RegSubRegPair &RSR) { 8334 if (!RSR.SubReg) 8335 return false; 8336 switch (MI.getOpcode()) { 8337 default: break; 8338 case AMDGPU::REG_SEQUENCE: 8339 RSR = getRegSequenceSubReg(MI, RSR.SubReg); 8340 return true; 8341 // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg 8342 case AMDGPU::INSERT_SUBREG: 8343 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) 8344 // inserted the subreg we're looking for 8345 RSR = getRegOrUndef(MI.getOperand(2)); 8346 else { // the subreg in the rest of the reg 8347 auto R1 = getRegOrUndef(MI.getOperand(1)); 8348 if (R1.SubReg) // subreg of subreg isn't supported 8349 return false; 8350 RSR.Reg = R1.Reg; 8351 } 8352 return true; 8353 } 8354 return false; 8355 } 8356 8357 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, 8358 MachineRegisterInfo &MRI) { 8359 assert(MRI.isSSA()); 8360 if (!P.Reg.isVirtual()) 8361 return nullptr; 8362 8363 auto RSR = P; 8364 auto *DefInst = MRI.getVRegDef(RSR.Reg); 8365 while (auto *MI = DefInst) { 8366 DefInst = nullptr; 8367 switch (MI->getOpcode()) { 8368 case AMDGPU::COPY: 8369 case AMDGPU::V_MOV_B32_e32: { 8370 auto &Op1 = MI->getOperand(1); 8371 if (Op1.isReg() && Op1.getReg().isVirtual()) { 8372 if (Op1.isUndef()) 8373 return nullptr; 8374 RSR = getRegSubRegPair(Op1); 8375 DefInst = MRI.getVRegDef(RSR.Reg); 8376 } 8377 break; 8378 } 8379 default: 8380 if (followSubRegDef(*MI, RSR)) { 8381 if (!RSR.Reg) 8382 return nullptr; 8383 DefInst = MRI.getVRegDef(RSR.Reg); 8384 } 8385 } 8386 if (!DefInst) 8387 return MI; 8388 } 8389 return nullptr; 8390 } 8391 8392 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, 8393 Register VReg, 8394 const MachineInstr &DefMI, 8395 const MachineInstr &UseMI) { 8396 assert(MRI.isSSA() && "Must be run on SSA"); 8397 8398 auto *TRI = MRI.getTargetRegisterInfo(); 8399 auto *DefBB = DefMI.getParent(); 8400 8401 // Don't bother searching between blocks, although 
it is possible this block 8402 // doesn't modify exec. 8403 if (UseMI.getParent() != DefBB) 8404 return true; 8405 8406 const int MaxInstScan = 20; 8407 int NumInst = 0; 8408 8409 // Stop scan at the use. 8410 auto E = UseMI.getIterator(); 8411 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) { 8412 if (I->isDebugInstr()) 8413 continue; 8414 8415 if (++NumInst > MaxInstScan) 8416 return true; 8417 8418 if (I->modifiesRegister(AMDGPU::EXEC, TRI)) 8419 return true; 8420 } 8421 8422 return false; 8423 } 8424 8425 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, 8426 Register VReg, 8427 const MachineInstr &DefMI) { 8428 assert(MRI.isSSA() && "Must be run on SSA"); 8429 8430 auto *TRI = MRI.getTargetRegisterInfo(); 8431 auto *DefBB = DefMI.getParent(); 8432 8433 const int MaxUseScan = 10; 8434 int NumUse = 0; 8435 8436 for (auto &Use : MRI.use_nodbg_operands(VReg)) { 8437 auto &UseInst = *Use.getParent(); 8438 // Don't bother searching between blocks, although it is possible this block 8439 // doesn't modify exec. 8440 if (UseInst.getParent() != DefBB || UseInst.isPHI()) 8441 return true; 8442 8443 if (++NumUse > MaxUseScan) 8444 return true; 8445 } 8446 8447 if (NumUse == 0) 8448 return false; 8449 8450 const int MaxInstScan = 20; 8451 int NumInst = 0; 8452 8453 // Stop scan when we have seen all the uses. 8454 for (auto I = std::next(DefMI.getIterator()); ; ++I) { 8455 assert(I != DefBB->end()); 8456 8457 if (I->isDebugInstr()) 8458 continue; 8459 8460 if (++NumInst > MaxInstScan) 8461 return true; 8462 8463 for (const MachineOperand &Op : I->operands()) { 8464 // We don't check reg masks here as they're used only on calls: 8465 // 1. EXEC is only considered const within one BB 8466 // 2. Call should be a terminator instruction if present in a BB 8467 8468 if (!Op.isReg()) 8469 continue; 8470 8471 Register Reg = Op.getReg(); 8472 if (Op.isUse()) { 8473 if (Reg == VReg && --NumUse == 0) 8474 return false; 8475 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC)) 8476 return true; 8477 } 8478 } 8479 } 8480 8481 MachineInstr *SIInstrInfo::createPHIDestinationCopy( 8482 MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, 8483 const DebugLoc &DL, Register Src, Register Dst) const { 8484 auto Cur = MBB.begin(); 8485 if (Cur != MBB.end()) 8486 do { 8487 if (!Cur->isPHI() && Cur->readsRegister(Dst)) 8488 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); 8489 ++Cur; 8490 } while (Cur != MBB.end() && Cur != LastPHIIt); 8491 8492 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, 8493 Dst); 8494 } 8495 8496 MachineInstr *SIInstrInfo::createPHISourceCopy( 8497 MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, 8498 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const { 8499 if (InsPt != MBB.end() && 8500 (InsPt->getOpcode() == AMDGPU::SI_IF || 8501 InsPt->getOpcode() == AMDGPU::SI_ELSE || 8502 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && 8503 InsPt->definesRegister(Src)) { 8504 InsPt++; 8505 return BuildMI(MBB, InsPt, DL, 8506 get(ST.isWave32() ? 
AMDGPU::S_MOV_B32_term 8507 : AMDGPU::S_MOV_B64_term), 8508 Dst) 8509 .addReg(Src, 0, SrcSubReg) 8510 .addReg(AMDGPU::EXEC, RegState::Implicit); 8511 } 8512 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, 8513 Dst); 8514 } 8515 8516 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } 8517 8518 MachineInstr *SIInstrInfo::foldMemoryOperandImpl( 8519 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 8520 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS, 8521 VirtRegMap *VRM) const { 8522 // This is a bit of a hack (copied from AArch64). Consider this instruction: 8523 // 8524 // %0:sreg_32 = COPY $m0 8525 // 8526 // We explicitly chose SReg_32 for the virtual register so such a copy might 8527 // be eliminated by RegisterCoalescer. However, that may not be possible, and 8528 // %0 may even spill. We can't spill $m0 normally (it would require copying to 8529 // a numbered SGPR anyway), and since it is in the SReg_32 register class, 8530 // TargetInstrInfo::foldMemoryOperand() is going to try. 8531 // A similar issue also exists with spilling and reloading $exec registers. 8532 // 8533 // To prevent that, constrain the %0 register class here. 8534 if (MI.isFullCopy()) { 8535 Register DstReg = MI.getOperand(0).getReg(); 8536 Register SrcReg = MI.getOperand(1).getReg(); 8537 if ((DstReg.isVirtual() || SrcReg.isVirtual()) && 8538 (DstReg.isVirtual() != SrcReg.isVirtual())) { 8539 MachineRegisterInfo &MRI = MF.getRegInfo(); 8540 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg; 8541 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); 8542 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { 8543 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); 8544 return nullptr; 8545 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { 8546 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); 8547 return nullptr; 8548 } 8549 } 8550 } 8551 8552 return nullptr; 8553 } 8554 8555 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, 8556 const MachineInstr &MI, 8557 unsigned *PredCost) const { 8558 if (MI.isBundle()) { 8559 MachineBasicBlock::const_instr_iterator I(MI.getIterator()); 8560 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end()); 8561 unsigned Lat = 0, Count = 0; 8562 for (++I; I != E && I->isBundledWithPred(); ++I) { 8563 ++Count; 8564 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I)); 8565 } 8566 return Lat + Count - 1; 8567 } 8568 8569 return SchedModel.computeInstrLatency(&MI); 8570 } 8571 8572 InstructionUniformity 8573 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { 8574 unsigned opcode = MI.getOpcode(); 8575 if (opcode == AMDGPU::G_INTRINSIC || 8576 opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) { 8577 auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID()); 8578 if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) 8579 return InstructionUniformity::NeverUniform; 8580 if (AMDGPU::isIntrinsicAlwaysUniform(IID)) 8581 return InstructionUniformity::AlwaysUniform; 8582 8583 switch (IID) { 8584 case Intrinsic::amdgcn_if: 8585 case Intrinsic::amdgcn_else: 8586 // FIXME: Uniform if second result 8587 break; 8588 } 8589 8590 return InstructionUniformity::Default; 8591 } 8592 8593 // Loads from the private and flat address spaces are divergent, because 8594 // threads can execute the load instruction with the same inputs and get 8595 // different results. 
8596 // 8597 // All other loads are not divergent, because if threads issue loads with the 8598 // same arguments, they will always get the same result. 8599 if (opcode == AMDGPU::G_LOAD) { 8600 if (MI.memoperands_empty()) 8601 return InstructionUniformity::NeverUniform; // conservative assumption 8602 8603 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) { 8604 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS || 8605 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; 8606 })) { 8607 // At least one MMO in a non-global address space. 8608 return InstructionUniformity::NeverUniform; 8609 } 8610 return InstructionUniformity::Default; 8611 } 8612 8613 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) || 8614 opcode == AMDGPU::G_ATOMIC_CMPXCHG || 8615 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) { 8616 return InstructionUniformity::NeverUniform; 8617 } 8618 return InstructionUniformity::Default; 8619 } 8620 8621 InstructionUniformity 8622 SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { 8623 8624 if (isNeverUniform(MI)) 8625 return InstructionUniformity::NeverUniform; 8626 8627 unsigned opcode = MI.getOpcode(); 8628 if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) 8629 return InstructionUniformity::AlwaysUniform; 8630 8631 if (MI.isCopy()) { 8632 const MachineOperand &srcOp = MI.getOperand(1); 8633 if (srcOp.isReg() && srcOp.getReg().isPhysical()) { 8634 const TargetRegisterClass *regClass = 8635 RI.getPhysRegBaseClass(srcOp.getReg()); 8636 return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform 8637 : InstructionUniformity::NeverUniform; 8638 } 8639 return InstructionUniformity::Default; 8640 } 8641 8642 // GMIR handling 8643 if (MI.isPreISelOpcode()) 8644 return SIInstrInfo::getGenericInstructionUniformity(MI); 8645 8646 // Atomics are divergent because they are executed sequentially: when an 8647 // atomic operation refers to the same address in each thread, then each 8648 // thread after the first sees the value written by the previous thread as 8649 // original value. 8650 8651 if (isAtomic(MI)) 8652 return InstructionUniformity::NeverUniform; 8653 8654 // Loads from the private and flat address spaces are divergent, because 8655 // threads can execute the load instruction with the same inputs and get 8656 // different results. 8657 if (isFLAT(MI) && MI.mayLoad()) { 8658 if (MI.memoperands_empty()) 8659 return InstructionUniformity::NeverUniform; // conservative assumption 8660 8661 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) { 8662 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS || 8663 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS; 8664 })) { 8665 // At least one MMO in a non-global address space. 8666 return InstructionUniformity::NeverUniform; 8667 } 8668 8669 return InstructionUniformity::Default; 8670 } 8671 8672 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); 8673 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); 8674 8675 // FIXME: It's conceptually broken to report this for an instruction, and not 8676 // a specific def operand. For inline asm in particular, there could be mixed 8677 // uniform and divergent results. 
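// For example (illustrative): an inline-asm instruction that reads a virtual
// register assigned to the VGPR bank is reported NeverUniform by the loop
// below, even if some of its results would, taken individually, be uniform.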
8678 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
8679 const MachineOperand &SrcOp = MI.getOperand(I);
8680 if (!SrcOp.isReg())
8681 continue;
8682
8683 Register Reg = SrcOp.getReg();
8684 if (!Reg || !SrcOp.readsReg())
8685 continue;
8686
8687 // If RegBank is null, this is unassigned or an unallocatable special
8688 // register, which are all scalars.
8689 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
8690 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
8691 return InstructionUniformity::NeverUniform;
8692 }
8693
8694 // TODO: Uniformity check conditions above can be rearranged for more
8695 // readability
8696
8697 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
8698 // currently turned into no-op COPYs by SelectionDAG ISel and are
8699 // therefore no longer recognizable.
8700
8701 return InstructionUniformity::Default;
8702 }
8703
8704 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
8705 switch (MF.getFunction().getCallingConv()) {
8706 case CallingConv::AMDGPU_PS:
8707 return 1;
8708 case CallingConv::AMDGPU_VS:
8709 return 2;
8710 case CallingConv::AMDGPU_GS:
8711 return 3;
8712 case CallingConv::AMDGPU_HS:
8713 case CallingConv::AMDGPU_LS:
8714 case CallingConv::AMDGPU_ES:
8715 report_fatal_error("ds_ordered_count unsupported for this calling conv");
8716 case CallingConv::AMDGPU_CS:
8717 case CallingConv::AMDGPU_KERNEL:
8718 case CallingConv::C:
8719 case CallingConv::Fast:
8720 default:
8721 // Assume other calling conventions are various compute callable functions
8722 return 0;
8723 }
8724 }
8725
8726 bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
8727 Register &SrcReg2, int64_t &CmpMask,
8728 int64_t &CmpValue) const {
8729 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
8730 return false;
8731
8732 switch (MI.getOpcode()) {
8733 default:
8734 break;
8735 case AMDGPU::S_CMP_EQ_U32:
8736 case AMDGPU::S_CMP_EQ_I32:
8737 case AMDGPU::S_CMP_LG_U32:
8738 case AMDGPU::S_CMP_LG_I32:
8739 case AMDGPU::S_CMP_LT_U32:
8740 case AMDGPU::S_CMP_LT_I32:
8741 case AMDGPU::S_CMP_GT_U32:
8742 case AMDGPU::S_CMP_GT_I32:
8743 case AMDGPU::S_CMP_LE_U32:
8744 case AMDGPU::S_CMP_LE_I32:
8745 case AMDGPU::S_CMP_GE_U32:
8746 case AMDGPU::S_CMP_GE_I32:
8747 case AMDGPU::S_CMP_EQ_U64:
8748 case AMDGPU::S_CMP_LG_U64:
8749 SrcReg = MI.getOperand(0).getReg();
8750 if (MI.getOperand(1).isReg()) {
8751 if (MI.getOperand(1).getSubReg())
8752 return false;
8753 SrcReg2 = MI.getOperand(1).getReg();
8754 CmpValue = 0;
8755 } else if (MI.getOperand(1).isImm()) {
8756 SrcReg2 = Register();
8757 CmpValue = MI.getOperand(1).getImm();
8758 } else {
8759 return false;
8760 }
8761 CmpMask = ~0;
8762 return true;
8763 case AMDGPU::S_CMPK_EQ_U32:
8764 case AMDGPU::S_CMPK_EQ_I32:
8765 case AMDGPU::S_CMPK_LG_U32:
8766 case AMDGPU::S_CMPK_LG_I32:
8767 case AMDGPU::S_CMPK_LT_U32:
8768 case AMDGPU::S_CMPK_LT_I32:
8769 case AMDGPU::S_CMPK_GT_U32:
8770 case AMDGPU::S_CMPK_GT_I32:
8771 case AMDGPU::S_CMPK_LE_U32:
8772 case AMDGPU::S_CMPK_LE_I32:
8773 case AMDGPU::S_CMPK_GE_U32:
8774 case AMDGPU::S_CMPK_GE_I32:
8775 SrcReg = MI.getOperand(0).getReg();
8776 SrcReg2 = Register();
8777 CmpValue = MI.getOperand(1).getImm();
8778 CmpMask = ~0;
8779 return true;
8780 }
8781
8782 return false;
8783 }
8784
8785 bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
8786 Register SrcReg2, int64_t CmpMask,
8787 int64_t CmpValue,
8788 const MachineRegisterInfo *MRI) const {
8789 if
(!SrcReg || SrcReg.isPhysical()) 8790 return false; 8791 8792 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) 8793 return false; 8794 8795 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, 8796 this](int64_t ExpectedValue, unsigned SrcSize, 8797 bool IsReversible, bool IsSigned) -> bool { 8798 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 8799 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 8800 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 8801 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n 8802 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n 8803 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 8804 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 8805 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 8806 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n 8807 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n 8808 // 8809 // Signed ge/gt are not used for the sign bit. 8810 // 8811 // If result of the AND is unused except in the compare: 8812 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n 8813 // 8814 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n 8815 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n 8816 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n 8817 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n 8818 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n 8819 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n 8820 8821 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg); 8822 if (!Def || Def->getParent() != CmpInstr.getParent()) 8823 return false; 8824 8825 if (Def->getOpcode() != AMDGPU::S_AND_B32 && 8826 Def->getOpcode() != AMDGPU::S_AND_B64) 8827 return false; 8828 8829 int64_t Mask; 8830 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool { 8831 if (MO->isImm()) 8832 Mask = MO->getImm(); 8833 else if (!getFoldableImm(MO, Mask)) 8834 return false; 8835 Mask &= maxUIntN(SrcSize); 8836 return isPowerOf2_64(Mask); 8837 }; 8838 8839 MachineOperand *SrcOp = &Def->getOperand(1); 8840 if (isMask(SrcOp)) 8841 SrcOp = &Def->getOperand(2); 8842 else if (isMask(&Def->getOperand(2))) 8843 SrcOp = &Def->getOperand(1); 8844 else 8845 return false; 8846 8847 unsigned BitNo = llvm::countr_zero((uint64_t)Mask); 8848 if (IsSigned && BitNo == SrcSize - 1) 8849 return false; 8850 8851 ExpectedValue <<= BitNo; 8852 8853 bool IsReversedCC = false; 8854 if (CmpValue != ExpectedValue) { 8855 if (!IsReversible) 8856 return false; 8857 IsReversedCC = CmpValue == (ExpectedValue ^ Mask); 8858 if (!IsReversedCC) 8859 return false; 8860 } 8861 8862 Register DefReg = Def->getOperand(0).getReg(); 8863 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) 8864 return false; 8865 8866 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator(); 8867 I != E; ++I) { 8868 if (I->modifiesRegister(AMDGPU::SCC, &RI) || 8869 I->killsRegister(AMDGPU::SCC, &RI)) 8870 return false; 8871 } 8872 8873 MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC); 8874 SccDef->setIsDead(false); 8875 CmpInstr.eraseFromParent(); 8876 8877 if (!MRI->use_nodbg_empty(DefReg)) { 8878 assert(!IsReversedCC); 8879 return true; 8880 } 8881 8882 // Replace AND with unused result with a S_BITCMP. 
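// Schematic example of the rewrite (virtual register numbers are made up):
//   %5:sreg_32 = S_AND_B32 %4:sreg_32, 16, implicit-def dead $scc  ; 1 << 4
//   S_CMP_LG_U32 %5, 0, implicit-def $scc
// becomes, once the compare is erased and %5 has no remaining uses,
//   S_BITCMP1_B32 %4, 4, implicit-def $scc
// which leaves the same value in SCC (bit 4 of %4).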
8883 MachineBasicBlock *MBB = Def->getParent(); 8884 8885 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32 8886 : AMDGPU::S_BITCMP1_B32 8887 : IsReversedCC ? AMDGPU::S_BITCMP0_B64 8888 : AMDGPU::S_BITCMP1_B64; 8889 8890 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc)) 8891 .add(*SrcOp) 8892 .addImm(BitNo); 8893 Def->eraseFromParent(); 8894 8895 return true; 8896 }; 8897 8898 switch (CmpInstr.getOpcode()) { 8899 default: 8900 break; 8901 case AMDGPU::S_CMP_EQ_U32: 8902 case AMDGPU::S_CMP_EQ_I32: 8903 case AMDGPU::S_CMPK_EQ_U32: 8904 case AMDGPU::S_CMPK_EQ_I32: 8905 return optimizeCmpAnd(1, 32, true, false); 8906 case AMDGPU::S_CMP_GE_U32: 8907 case AMDGPU::S_CMPK_GE_U32: 8908 return optimizeCmpAnd(1, 32, false, false); 8909 case AMDGPU::S_CMP_GE_I32: 8910 case AMDGPU::S_CMPK_GE_I32: 8911 return optimizeCmpAnd(1, 32, false, true); 8912 case AMDGPU::S_CMP_EQ_U64: 8913 return optimizeCmpAnd(1, 64, true, false); 8914 case AMDGPU::S_CMP_LG_U32: 8915 case AMDGPU::S_CMP_LG_I32: 8916 case AMDGPU::S_CMPK_LG_U32: 8917 case AMDGPU::S_CMPK_LG_I32: 8918 return optimizeCmpAnd(0, 32, true, false); 8919 case AMDGPU::S_CMP_GT_U32: 8920 case AMDGPU::S_CMPK_GT_U32: 8921 return optimizeCmpAnd(0, 32, false, false); 8922 case AMDGPU::S_CMP_GT_I32: 8923 case AMDGPU::S_CMPK_GT_I32: 8924 return optimizeCmpAnd(0, 32, false, true); 8925 case AMDGPU::S_CMP_LG_U64: 8926 return optimizeCmpAnd(0, 64, true, false); 8927 } 8928 8929 return false; 8930 } 8931 8932 void SIInstrInfo::enforceOperandRCAlignment(MachineInstr &MI, 8933 unsigned OpName) const { 8934 if (!ST.needsAlignedVGPRs()) 8935 return; 8936 8937 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); 8938 if (OpNo < 0) 8939 return; 8940 MachineOperand &Op = MI.getOperand(OpNo); 8941 if (getOpSize(MI, OpNo) > 4) 8942 return; 8943 8944 // Add implicit aligned super-reg to force alignment on the data operand. 8945 const DebugLoc &DL = MI.getDebugLoc(); 8946 MachineBasicBlock *BB = MI.getParent(); 8947 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 8948 Register DataReg = Op.getReg(); 8949 bool IsAGPR = RI.isAGPR(MRI, DataReg); 8950 Register Undef = MRI.createVirtualRegister( 8951 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); 8952 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef); 8953 Register NewVR = 8954 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass 8955 : &AMDGPU::VReg_64_Align2RegClass); 8956 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR) 8957 .addReg(DataReg, 0, Op.getSubReg()) 8958 .addImm(AMDGPU::sub0) 8959 .addReg(Undef) 8960 .addImm(AMDGPU::sub1); 8961 Op.setReg(NewVR); 8962 Op.setSubReg(AMDGPU::sub0); 8963 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); 8964 } 8965