1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to fuse DS instructions with close by immediate offsets. 10 // This will fuse operations such as 11 // ds_read_b32 v0, v2 offset:16 12 // ds_read_b32 v1, v2 offset:32 13 // ==> 14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 15 // 16 // The same is done for certain SMEM and VMEM opcodes, e.g.: 17 // s_buffer_load_dword s4, s[0:3], 4 18 // s_buffer_load_dword s5, s[0:3], 8 19 // ==> 20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 21 // 22 // This pass also tries to promote constant offset to the immediate by 23 // adjusting the base. It tries to use a base from the nearby instructions that 24 // allows it to have a 13bit constant offset and then promotes the 13bit offset 25 // to the immediate. 26 // E.g. 27 // s_movk_i32 s0, 0x1800 28 // v_add_co_u32_e32 v0, vcc, s0, v2 29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc 30 // 31 // s_movk_i32 s0, 0x1000 32 // v_add_co_u32_e32 v5, vcc, s0, v2 33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 34 // global_load_dwordx2 v[5:6], v[5:6], off 35 // global_load_dwordx2 v[0:1], v[0:1], off 36 // => 37 // s_movk_i32 s0, 0x1000 38 // v_add_co_u32_e32 v5, vcc, s0, v2 39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 40 // global_load_dwordx2 v[5:6], v[5:6], off 41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 42 // 43 // Future improvements: 44 // 45 // - This is currently missing stores of constants because loading 46 // the constant into the data register is placed between the stores, although 47 // this is arguably a scheduling problem. 48 // 49 // - Live interval recomputing seems inefficient. This currently only matches 50 // one pair, and recomputes live intervals and moves on to the next pair. It 51 // would be better to compute a list of all merges that need to occur. 52 // 53 // - With a list of instructions to process, we can also merge more. If a 54 // cluster of loads have offsets that are too large to fit in the 8-bit 55 // offsets, but are close enough to fit in the 8 bits, we can add to the base 56 // pointer and use the new reduced offsets. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "GCNSubtarget.h" 62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 63 #include "llvm/Analysis/AliasAnalysis.h" 64 #include "llvm/CodeGen/MachineFunctionPass.h" 65 #include "llvm/InitializePasses.h" 66 67 using namespace llvm; 68 69 #define DEBUG_TYPE "si-load-store-opt" 70 71 namespace { 72 enum InstClassEnum { 73 UNKNOWN, 74 DS_READ, 75 DS_WRITE, 76 S_BUFFER_LOAD_IMM, 77 BUFFER_LOAD, 78 BUFFER_STORE, 79 MIMG, 80 TBUFFER_LOAD, 81 TBUFFER_STORE, 82 }; 83 84 struct AddressRegs { 85 unsigned char NumVAddrs = 0; 86 bool SBase = false; 87 bool SRsrc = false; 88 bool SOffset = false; 89 bool VAddr = false; 90 bool Addr = false; 91 bool SSamp = false; 92 }; 93 94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 
95 const unsigned MaxAddressRegs = 12 + 1 + 1; 96 97 class SILoadStoreOptimizer : public MachineFunctionPass { 98 struct CombineInfo { 99 MachineBasicBlock::iterator I; 100 unsigned EltSize; 101 unsigned Offset; 102 unsigned Width; 103 unsigned Format; 104 unsigned BaseOff; 105 unsigned DMask; 106 InstClassEnum InstClass; 107 unsigned CPol = 0; 108 bool IsAGPR; 109 bool UseST64; 110 int AddrIdx[MaxAddressRegs]; 111 const MachineOperand *AddrReg[MaxAddressRegs]; 112 unsigned NumAddresses; 113 unsigned Order; 114 115 bool hasSameBaseAddress(const MachineInstr &MI) { 116 for (unsigned i = 0; i < NumAddresses; i++) { 117 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 118 119 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 120 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 121 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 122 return false; 123 } 124 continue; 125 } 126 127 // Check same base pointer. Be careful of subregisters, which can occur 128 // with vectors of pointers. 129 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 130 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 131 return false; 132 } 133 } 134 return true; 135 } 136 137 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 138 for (unsigned i = 0; i < NumAddresses; ++i) { 139 const MachineOperand *AddrOp = AddrReg[i]; 140 // Immediates are always OK. 141 if (AddrOp->isImm()) 142 continue; 143 144 // Don't try to merge addresses that aren't either immediates or registers. 145 // TODO: Should be possible to merge FrameIndexes and maybe some other 146 // non-register 147 if (!AddrOp->isReg()) 148 return false; 149 150 // TODO: We should be able to merge physical reg addresses. 151 if (AddrOp->getReg().isPhysical()) 152 return false; 153 154 // If an address has only one use then there will be no other 155 // instructions with the same address, so we can't merge this one.
156 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 157 return false; 158 } 159 return true; 160 } 161 162 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 163 }; 164 165 struct BaseRegisters { 166 Register LoReg; 167 Register HiReg; 168 169 unsigned LoSubReg = 0; 170 unsigned HiSubReg = 0; 171 }; 172 173 struct MemAddress { 174 BaseRegisters Base; 175 int64_t Offset = 0; 176 }; 177 178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 179 180 private: 181 const GCNSubtarget *STM = nullptr; 182 const SIInstrInfo *TII = nullptr; 183 const SIRegisterInfo *TRI = nullptr; 184 MachineRegisterInfo *MRI = nullptr; 185 AliasAnalysis *AA = nullptr; 186 bool OptimizeAgain; 187 188 static bool dmasksCanBeCombined(const CombineInfo &CI, 189 const SIInstrInfo &TII, 190 const CombineInfo &Paired); 191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 192 CombineInfo &Paired, bool Modify = false); 193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 194 const CombineInfo &Paired); 195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 197 const CombineInfo &Paired); 198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 199 const CombineInfo &Paired); 200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 201 202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 203 SmallVectorImpl<MachineInstr *> &InstsToMove); 204 205 unsigned read2Opcode(unsigned EltSize) const; 206 unsigned read2ST64Opcode(unsigned EltSize) const; 207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 208 CombineInfo &Paired, 209 const SmallVectorImpl<MachineInstr *> &InstsToMove); 210 211 unsigned write2Opcode(unsigned EltSize) const; 212 unsigned write2ST64Opcode(unsigned EltSize) const; 213 MachineBasicBlock::iterator 214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 215 const SmallVectorImpl<MachineInstr *> &InstsToMove); 216 MachineBasicBlock::iterator 217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 235 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 236 int32_t NewOffset) const; 237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 241 /// Promotes constant offset to the immediate by adjusting the base. 
It 242 /// tries to use a base from the nearby instructions that allows it to have 243 /// a 13bit constant offset which gets promoted to the immediate. 244 bool promoteConstantOffsetToImm(MachineInstr &CI, 245 MemInfoMap &Visited, 246 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 247 void addInstToMergeableList(const CombineInfo &CI, 248 std::list<std::list<CombineInfo> > &MergeableInsts) const; 249 250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 253 std::list<std::list<CombineInfo>> &MergeableInsts) const; 254 255 public: 256 static char ID; 257 258 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 260 } 261 262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 263 bool &OptimizeListAgain); 264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 265 266 bool runOnMachineFunction(MachineFunction &MF) override; 267 268 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.setPreservesCFG(); 272 AU.addRequired<AAResultsWrapperPass>(); 273 274 MachineFunctionPass::getAnalysisUsage(AU); 275 } 276 277 MachineFunctionProperties getRequiredProperties() const override { 278 return MachineFunctionProperties() 279 .set(MachineFunctionProperties::Property::IsSSA); 280 } 281 }; 282 283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 284 const unsigned Opc = MI.getOpcode(); 285 286 if (TII.isMUBUF(Opc)) { 287 // FIXME: Handle d16 correctly 288 return AMDGPU::getMUBUFElements(Opc); 289 } 290 if (TII.isMIMG(MI)) { 291 uint64_t DMaskImm = 292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 293 return countPopulation(DMaskImm); 294 } 295 if (TII.isMTBUF(Opc)) { 296 return AMDGPU::getMTBUFElements(Opc); 297 } 298 299 switch (Opc) { 300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 301 return 1; 302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 303 return 2; 304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 305 return 4; 306 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 307 return 8; 308 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 309 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 310 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 311 case AMDGPU::DS_WRITE_B32_gfx9: 312 return 1; 313 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 314 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 315 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 316 case AMDGPU::DS_WRITE_B64_gfx9: 317 return 2; 318 default: 319 return 0; 320 } 321 } 322 323 /// Maps instruction opcode to enum InstClassEnum. 324 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 325 switch (Opc) { 326 default: 327 if (TII.isMUBUF(Opc)) { 328 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 329 default: 330 return UNKNOWN; 331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 335 return BUFFER_LOAD; 336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 340 return BUFFER_STORE; 341 } 342 } 343 if (TII.isMIMG(Opc)) { 344 // Ignore instructions encoded without vaddr. 
345 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 346 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 347 return UNKNOWN; 348 // Ignore BVH instructions 349 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 350 return UNKNOWN; 351 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 352 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 353 TII.isGather4(Opc)) 354 return UNKNOWN; 355 return MIMG; 356 } 357 if (TII.isMTBUF(Opc)) { 358 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 359 default: 360 return UNKNOWN; 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 365 return TBUFFER_LOAD; 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 370 return TBUFFER_STORE; 371 } 372 } 373 return UNKNOWN; 374 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 375 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 377 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 378 return S_BUFFER_LOAD_IMM; 379 case AMDGPU::DS_READ_B32: 380 case AMDGPU::DS_READ_B32_gfx9: 381 case AMDGPU::DS_READ_B64: 382 case AMDGPU::DS_READ_B64_gfx9: 383 return DS_READ; 384 case AMDGPU::DS_WRITE_B32: 385 case AMDGPU::DS_WRITE_B32_gfx9: 386 case AMDGPU::DS_WRITE_B64: 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return DS_WRITE; 389 } 390 } 391 392 /// Determines instruction subclass from opcode. Only instructions 393 /// of the same subclass can be merged together. 394 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 395 switch (Opc) { 396 default: 397 if (TII.isMUBUF(Opc)) 398 return AMDGPU::getMUBUFBaseOpcode(Opc); 399 if (TII.isMIMG(Opc)) { 400 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 401 assert(Info); 402 return Info->BaseOpcode; 403 } 404 if (TII.isMTBUF(Opc)) 405 return AMDGPU::getMTBUFBaseOpcode(Opc); 406 return -1; 407 case AMDGPU::DS_READ_B32: 408 case AMDGPU::DS_READ_B32_gfx9: 409 case AMDGPU::DS_READ_B64: 410 case AMDGPU::DS_READ_B64_gfx9: 411 case AMDGPU::DS_WRITE_B32: 412 case AMDGPU::DS_WRITE_B32_gfx9: 413 case AMDGPU::DS_WRITE_B64: 414 case AMDGPU::DS_WRITE_B64_gfx9: 415 return Opc; 416 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 417 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 418 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 420 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 421 } 422 } 423 424 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 425 AddressRegs Result; 426 427 if (TII.isMUBUF(Opc)) { 428 if (AMDGPU::getMUBUFHasVAddr(Opc)) 429 Result.VAddr = true; 430 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 431 Result.SRsrc = true; 432 if (AMDGPU::getMUBUFHasSoffset(Opc)) 433 Result.SOffset = true; 434 435 return Result; 436 } 437 438 if (TII.isMIMG(Opc)) { 439 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 440 if (VAddr0Idx >= 0) { 441 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 442 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 443 } else { 444 Result.VAddr = true; 445 } 446 Result.SRsrc = true; 447 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 448 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 449 Result.SSamp = true; 450 451 return Result; 452 } 453 if (TII.isMTBUF(Opc)) { 454 if (AMDGPU::getMTBUFHasVAddr(Opc)) 455 Result.VAddr = 
true; 456 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 457 Result.SRsrc = true; 458 if (AMDGPU::getMTBUFHasSoffset(Opc)) 459 Result.SOffset = true; 460 461 return Result; 462 } 463 464 switch (Opc) { 465 default: 466 return Result; 467 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 468 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 469 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 470 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 471 Result.SBase = true; 472 return Result; 473 case AMDGPU::DS_READ_B32: 474 case AMDGPU::DS_READ_B64: 475 case AMDGPU::DS_READ_B32_gfx9: 476 case AMDGPU::DS_READ_B64_gfx9: 477 case AMDGPU::DS_WRITE_B32: 478 case AMDGPU::DS_WRITE_B64: 479 case AMDGPU::DS_WRITE_B32_gfx9: 480 case AMDGPU::DS_WRITE_B64_gfx9: 481 Result.Addr = true; 482 return Result; 483 } 484 } 485 486 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 487 const SILoadStoreOptimizer &LSO) { 488 I = MI; 489 unsigned Opc = MI->getOpcode(); 490 InstClass = getInstClass(Opc, *LSO.TII); 491 492 if (InstClass == UNKNOWN) 493 return; 494 495 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 496 497 switch (InstClass) { 498 case DS_READ: 499 EltSize = 500 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 501 : 4; 502 break; 503 case DS_WRITE: 504 EltSize = 505 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 506 : 4; 507 break; 508 case S_BUFFER_LOAD_IMM: 509 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 510 break; 511 default: 512 EltSize = 4; 513 break; 514 } 515 516 if (InstClass == MIMG) { 517 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 518 // Offset is not considered for MIMG instructions. 519 Offset = 0; 520 } else { 521 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 522 Offset = I->getOperand(OffsetIdx).getImm(); 523 } 524 525 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 526 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 527 528 Width = getOpcodeWidth(*I, *LSO.TII); 529 530 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 531 Offset &= 0xffff; 532 } else if (InstClass != MIMG) { 533 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 534 } 535 536 AddressRegs Regs = getRegs(Opc, *LSO.TII); 537 538 NumAddresses = 0; 539 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 540 AddrIdx[NumAddresses++] = 541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 542 if (Regs.Addr) 543 AddrIdx[NumAddresses++] = 544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 545 if (Regs.SBase) 546 AddrIdx[NumAddresses++] = 547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 548 if (Regs.SRsrc) 549 AddrIdx[NumAddresses++] = 550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 551 if (Regs.SOffset) 552 AddrIdx[NumAddresses++] = 553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 554 if (Regs.VAddr) 555 AddrIdx[NumAddresses++] = 556 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 557 if (Regs.SSamp) 558 AddrIdx[NumAddresses++] = 559 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 560 assert(NumAddresses <= MaxAddressRegs); 561 562 for (unsigned J = 0; J < NumAddresses; J++) 563 AddrReg[J] = &I->getOperand(AddrIdx[J]); 564 } 565 566 } // end anonymous namespace. 
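// Legacy pass-manager registration follows. The externally visible handles
// are SILoadStoreOptimizerID and createSILoadStoreOptimizerPass(); as a rough
// sketch (the actual hook lives in the AMDGPU target pipeline setup, not in
// this file), a pass config would typically schedule this pass with either
//   addPass(&SILoadStoreOptimizerID);           // by pass ID, or
//   addPass(createSILoadStoreOptimizerPass());  // via the factory function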
567 568 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 569 "SI Load Store Optimizer", false, false) 570 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 571 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 572 false, false) 573 574 char SILoadStoreOptimizer::ID = 0; 575 576 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 577 578 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 579 return new SILoadStoreOptimizer(); 580 } 581 582 static void moveInstsAfter(MachineBasicBlock::iterator I, 583 ArrayRef<MachineInstr *> InstsToMove) { 584 MachineBasicBlock *MBB = I->getParent(); 585 ++I; 586 for (MachineInstr *MI : InstsToMove) { 587 MI->removeFromParent(); 588 MBB->insert(I, MI); 589 } 590 } 591 592 static void addDefsUsesToList(const MachineInstr &MI, 593 DenseSet<Register> &RegDefs, 594 DenseSet<Register> &PhysRegUses) { 595 for (const MachineOperand &Op : MI.operands()) { 596 if (Op.isReg()) { 597 if (Op.isDef()) 598 RegDefs.insert(Op.getReg()); 599 else if (Op.readsReg() && Op.getReg().isPhysical()) 600 PhysRegUses.insert(Op.getReg()); 601 } 602 } 603 } 604 605 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, 606 MachineBasicBlock::iterator B, 607 AliasAnalysis *AA) { 608 // RAW or WAR - cannot reorder 609 // WAW - cannot reorder 610 // RAR - safe to reorder 611 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); 612 } 613 614 // Add MI and its defs to the lists if MI reads one of the defs that are 615 // already in the list. Returns true in that case. 616 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, 617 DenseSet<Register> &PhysRegUses, 618 SmallVectorImpl<MachineInstr *> &Insts) { 619 for (MachineOperand &Use : MI.operands()) { 620 // If one of the defs is read, then there is a use of Def between I and the 621 // instruction that I will potentially be merged with. We will need to move 622 // this instruction after the merged instructions. 623 // 624 // Similarly, if there is a def which is read by an instruction that is to 625 // be moved for merging, then we need to move the def-instruction as well. 626 // This can only happen for physical registers such as M0; virtual 627 // registers are in SSA form. 628 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || 629 (Use.isDef() && RegDefs.count(Use.getReg())) || 630 (Use.isDef() && Use.getReg().isPhysical() && 631 PhysRegUses.count(Use.getReg())))) { 632 Insts.push_back(&MI); 633 addDefsUsesToList(MI, RegDefs, PhysRegUses); 634 return true; 635 } 636 } 637 638 return false; 639 } 640 641 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, 642 ArrayRef<MachineInstr *> InstsToMove, 643 AliasAnalysis *AA) { 644 assert(MemOp.mayLoadOrStore()); 645 646 for (MachineInstr *InstToMove : InstsToMove) { 647 if (!InstToMove->mayLoadOrStore()) 648 continue; 649 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) 650 return false; 651 } 652 return true; 653 } 654 655 // This function assumes that \p A and \p B are identical except for 656 // size and offset, and they reference adjacent memory.
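// For example, combining MMOa = (offset 16, size 4) with MMOb = (offset 20,
// size 4) produces a single memoperand covering (offset 16, size 8): the
// sizes are summed and the smaller offset is kept, which is only correct
// because the caller has already established that the accesses are adjacent.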
657 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 658 const MachineMemOperand *A, 659 const MachineMemOperand *B) { 660 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 661 unsigned Size = A->getSize() + B->getSize(); 662 // This function adds the offset parameter to the existing offset for A, 663 // so we pass 0 here as the offset and then manually set it to the correct 664 // value after the call. 665 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 666 MMO->setOffset(MinOffset); 667 return MMO; 668 } 669 670 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 671 const SIInstrInfo &TII, 672 const CombineInfo &Paired) { 673 assert(CI.InstClass == MIMG); 674 675 // Ignore instructions with tfe/lwe set. 676 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 677 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 678 679 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 680 return false; 681 682 // Check other optional immediate operands for equality. 683 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 684 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 685 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 686 687 for (auto op : OperandsToMatch) { 688 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 689 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 690 return false; 691 if (Idx != -1 && 692 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 693 return false; 694 } 695 696 // Check DMask for overlaps. 697 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 698 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 699 700 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 701 if ((1u << AllowedBitsForMin) <= MinMask) 702 return false; 703 704 return true; 705 } 706 707 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 708 unsigned ComponentCount, 709 const GCNSubtarget &STI) { 710 if (ComponentCount > 4) 711 return 0; 712 713 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 714 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 715 if (!OldFormatInfo) 716 return 0; 717 718 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 719 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 720 ComponentCount, 721 OldFormatInfo->NumFormat, STI); 722 723 if (!NewFormatInfo) 724 return 0; 725 726 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 727 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 728 729 return NewFormatInfo->Format; 730 } 731 732 // Return the value in the inclusive range [Lo,Hi] that is aligned to the 733 // highest power of two. Note that the result is well defined for all inputs 734 // including corner cases like: 735 // - if Lo == Hi, return that value 736 // - if Lo == 0, return 0 (even though the "- 1" below underflows 737 // - if Lo > Hi, return 0 (as if the range wrapped around) 738 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { 739 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); 740 } 741 742 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 743 const GCNSubtarget &STI, 744 CombineInfo &Paired, 745 bool Modify) { 746 assert(CI.InstClass != MIMG); 747 748 // XXX - Would the same offset be OK? Is there any reason this would happen or 749 // be useful? 
750 if (CI.Offset == Paired.Offset) 751 return false; 752 753 // This won't be valid if the offset isn't aligned. 754 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 755 return false; 756 757 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 758 759 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 760 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 761 if (!Info0) 762 return false; 763 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 764 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 765 if (!Info1) 766 return false; 767 768 if (Info0->BitsPerComp != Info1->BitsPerComp || 769 Info0->NumFormat != Info1->NumFormat) 770 return false; 771 772 // TODO: Should be possible to support more formats, but if format loads 773 // are not dword-aligned, the merged load might not be valid. 774 if (Info0->BitsPerComp != 32) 775 return false; 776 777 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 778 return false; 779 } 780 781 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 782 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 783 CI.UseST64 = false; 784 CI.BaseOff = 0; 785 786 // Handle all non-DS instructions. 787 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 788 return (EltOffset0 + CI.Width == EltOffset1 || 789 EltOffset1 + Paired.Width == EltOffset0) && 790 CI.CPol == Paired.CPol && 791 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); 792 } 793 794 // If the offset in elements doesn't fit in 8-bits, we might be able to use 795 // the stride 64 versions. 796 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 797 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 798 if (Modify) { 799 CI.Offset = EltOffset0 / 64; 800 Paired.Offset = EltOffset1 / 64; 801 CI.UseST64 = true; 802 } 803 return true; 804 } 805 806 // Check if the new offsets fit in the reduced 8-bit range. 807 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 808 if (Modify) { 809 CI.Offset = EltOffset0; 810 Paired.Offset = EltOffset1; 811 } 812 return true; 813 } 814 815 // Try to shift base address to decrease offsets. 816 uint32_t Min = std::min(EltOffset0, EltOffset1); 817 uint32_t Max = std::max(EltOffset0, EltOffset1); 818 819 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 820 if (((Max - Min) & ~Mask) == 0) { 821 if (Modify) { 822 // From the range of values we could use for BaseOff, choose the one that 823 // is aligned to the highest power of two, to maximise the chance that 824 // the same offset can be reused for other load/store pairs. 825 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 826 // Copy the low bits of the offsets, so that when we adjust them by 827 // subtracting BaseOff they will be multiples of 64. 828 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 829 CI.BaseOff = BaseOff * CI.EltSize; 830 CI.Offset = (EltOffset0 - BaseOff) / 64; 831 Paired.Offset = (EltOffset1 - BaseOff) / 64; 832 CI.UseST64 = true; 833 } 834 return true; 835 } 836 837 if (isUInt<8>(Max - Min)) { 838 if (Modify) { 839 // From the range of values we could use for BaseOff, choose the one that 840 // is aligned to the highest power of two, to maximise the chance that 841 // the same offset can be reused for other load/store pairs. 
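      // Worked example: with EltSize == 4 and byte offsets 0x11000 and 0x11004,
      // the element offsets are 0x4400 and 0x4401 and neither fits in 8 bits.
      // BaseOff is chosen as 0x4400 (the most aligned value in the candidate
      // range), so CI.BaseOff becomes 0x11000 bytes and the encoded offsets
      // reduce to 0 and 1.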
842 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 843 CI.BaseOff = BaseOff * CI.EltSize; 844 CI.Offset = EltOffset0 - BaseOff; 845 Paired.Offset = EltOffset1 - BaseOff; 846 } 847 return true; 848 } 849 850 return false; 851 } 852 853 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 854 const CombineInfo &CI, 855 const CombineInfo &Paired) { 856 const unsigned Width = (CI.Width + Paired.Width); 857 switch (CI.InstClass) { 858 default: 859 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 860 case S_BUFFER_LOAD_IMM: 861 switch (Width) { 862 default: 863 return false; 864 case 2: 865 case 4: 866 case 8: 867 return true; 868 } 869 } 870 } 871 872 const TargetRegisterClass * 873 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 874 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 875 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 876 } 877 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 878 return TRI->getRegClassForReg(*MRI, Src->getReg()); 879 } 880 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 881 return TRI->getRegClassForReg(*MRI, Src->getReg()); 882 } 883 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 884 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 885 } 886 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 887 return TRI->getRegClassForReg(*MRI, Src->getReg()); 888 } 889 return nullptr; 890 } 891 892 /// This function assumes that CI comes before Paired in a basic block. 893 bool SILoadStoreOptimizer::checkAndPrepareMerge( 894 CombineInfo &CI, CombineInfo &Paired, 895 SmallVectorImpl<MachineInstr *> &InstsToMove) { 896 897 // Check both offsets (or masks for MIMG) can be combined and fit in the 898 // reduced range. 899 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 900 return false; 901 902 if (CI.InstClass != MIMG && 903 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 904 return false; 905 906 const unsigned Opc = CI.I->getOpcode(); 907 const InstClassEnum InstClass = getInstClass(Opc, *TII); 908 909 if (InstClass == UNKNOWN) { 910 return false; 911 } 912 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 913 914 DenseSet<Register> RegDefsToMove; 915 DenseSet<Register> PhysRegUsesToMove; 916 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 917 918 MachineBasicBlock::iterator E = std::next(Paired.I); 919 MachineBasicBlock::iterator MBBI = std::next(CI.I); 920 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 921 for (; MBBI != E; ++MBBI) { 922 923 if (MBBI == MBBE) { 924 // CombineInfo::Order is a hint on the instruction ordering within the 925 // basic block. This hint suggests that CI precedes Paired, which is 926 // true most of the time. However, moveInstsAfter() processing a 927 // previous list may have changed this order in a situation when it 928 // moves an instruction which exists in some other merge list. 929 // In this case it must be dependent. 930 return false; 931 } 932 933 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || 934 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { 935 // This is not a matching instruction, but we can keep looking as 936 // long as one of these conditions are met: 937 // 1. It is safe to move I down past MBBI. 938 // 2. It is safe to move MBBI down past the instruction that I will 939 // be merged into. 
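      // Condition #1 is what the memAccessesCanBeReordered() check below tests
      // directly. Condition #2 cannot be decided yet, so such instructions are
      // queued in InstsToMove and re-checked via canMoveInstsAcrossMemOp() once
      // the matching instruction (Paired) has been reached.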
940 941 if (MBBI->mayLoadOrStore() && 942 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 943 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { 944 // We fail condition #1, but we may still be able to satisfy condition 945 // #2. Add this instruction to the move list and then we will check 946 // if condition #2 holds once we have selected the matching instruction. 947 InstsToMove.push_back(&*MBBI); 948 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); 949 continue; 950 } 951 952 // When we match I with another DS instruction we will be moving I down 953 // to the location of the matched instruction any uses of I will need to 954 // be moved down as well. 955 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 956 InstsToMove); 957 continue; 958 } 959 960 // Handle a case like 961 // DS_WRITE_B32 addr, v, idx0 962 // w = DS_READ_B32 addr, idx0 963 // DS_WRITE_B32 addr, f(w), idx1 964 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents 965 // merging of the two writes. 966 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 967 InstsToMove)) 968 continue; 969 970 if (&*MBBI == &*Paired.I) { 971 // We need to go through the list of instructions that we plan to 972 // move and make sure they are all safe to move down past the merged 973 // instruction. 974 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 975 976 // Call offsetsCanBeCombined with modify = true so that the offsets are 977 // correct for the new instruction. This should return true, because 978 // this function should only be called on CombineInfo objects that 979 // have already been confirmed to be mergeable. 980 if (CI.InstClass != MIMG) 981 offsetsCanBeCombined(CI, *STM, Paired, true); 982 return true; 983 } 984 return false; 985 } 986 987 // We've found a load/store that we couldn't merge for some reason. 988 // We could potentially keep looking, but we'd need to make sure that 989 // it was safe to move I and also all the instruction in InstsToMove 990 // down past this instruction. 991 // check if we can move I across MBBI and if we can move all I's users 992 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 993 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 994 break; 995 } 996 return false; 997 } 998 999 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1000 if (STM->ldsRequiresM0Init()) 1001 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1002 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1003 } 1004 1005 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1006 if (STM->ldsRequiresM0Init()) 1007 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1008 1009 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1010 : AMDGPU::DS_READ2ST64_B64_gfx9; 1011 } 1012 1013 MachineBasicBlock::iterator 1014 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1015 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1016 MachineBasicBlock *MBB = CI.I->getParent(); 1017 1018 // Be careful, since the addresses could be subregisters themselves in weird 1019 // cases, like vectors of pointers. 
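  // As an illustration, the merge below turns
  //   %v0 = DS_READ_B32 %addr, offset:16
  //   %v1 = DS_READ_B32 %addr, offset:20
  // into a single
  //   %v01 = DS_READ2_B32 %addr, offset0:4, offset1:5
  // followed by two COPYs that extract sub0/sub1 back into the original
  // destinations.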
1020 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1021 1022 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1023 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1024 1025 unsigned NewOffset0 = CI.Offset; 1026 unsigned NewOffset1 = Paired.Offset; 1027 unsigned Opc = 1028 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1029 1030 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1031 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1032 1033 if (NewOffset0 > NewOffset1) { 1034 // Canonicalize the merged instruction so the smaller offset comes first. 1035 std::swap(NewOffset0, NewOffset1); 1036 std::swap(SubRegIdx0, SubRegIdx1); 1037 } 1038 1039 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1040 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1041 1042 const MCInstrDesc &Read2Desc = TII->get(Opc); 1043 1044 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1045 Register DestReg = MRI->createVirtualRegister(SuperRC); 1046 1047 DebugLoc DL = CI.I->getDebugLoc(); 1048 1049 Register BaseReg = AddrReg->getReg(); 1050 unsigned BaseSubReg = AddrReg->getSubReg(); 1051 unsigned BaseRegFlags = 0; 1052 if (CI.BaseOff) { 1053 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1054 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1055 .addImm(CI.BaseOff); 1056 1057 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1058 BaseRegFlags = RegState::Kill; 1059 1060 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1061 .addReg(ImmReg) 1062 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1063 .addImm(0); // clamp bit 1064 BaseSubReg = 0; 1065 } 1066 1067 MachineInstrBuilder Read2 = 1068 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1069 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1070 .addImm(NewOffset0) // offset0 1071 .addImm(NewOffset1) // offset1 1072 .addImm(0) // gds 1073 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1074 1075 (void)Read2; 1076 1077 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1078 1079 // Copy to the old destination registers. 1080 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1081 .add(*Dest0) // Copy to same destination including flags and sub reg. 1082 .addReg(DestReg, 0, SubRegIdx0); 1083 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1084 .add(*Dest1) 1085 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1086 1087 moveInstsAfter(Copy1, InstsToMove); 1088 1089 CI.I->eraseFromParent(); 1090 Paired.I->eraseFromParent(); 1091 1092 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1093 return Read2; 1094 } 1095 1096 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1097 if (STM->ldsRequiresM0Init()) 1098 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1099 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1100 : AMDGPU::DS_WRITE2_B64_gfx9; 1101 } 1102 1103 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1104 if (STM->ldsRequiresM0Init()) 1105 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1106 : AMDGPU::DS_WRITE2ST64_B64; 1107 1108 return (EltSize == 4) ? 
AMDGPU::DS_WRITE2ST64_B32_gfx9 1109 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1110 } 1111 1112 MachineBasicBlock::iterator 1113 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1114 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1115 MachineBasicBlock *MBB = CI.I->getParent(); 1116 1117 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1118 // sure we preserve the subregister index and any register flags set on them. 1119 const MachineOperand *AddrReg = 1120 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1121 const MachineOperand *Data0 = 1122 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1123 const MachineOperand *Data1 = 1124 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1125 1126 unsigned NewOffset0 = CI.Offset; 1127 unsigned NewOffset1 = Paired.Offset; 1128 unsigned Opc = 1129 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1130 1131 if (NewOffset0 > NewOffset1) { 1132 // Canonicalize the merged instruction so the smaller offset comes first. 1133 std::swap(NewOffset0, NewOffset1); 1134 std::swap(Data0, Data1); 1135 } 1136 1137 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1138 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1139 1140 const MCInstrDesc &Write2Desc = TII->get(Opc); 1141 DebugLoc DL = CI.I->getDebugLoc(); 1142 1143 Register BaseReg = AddrReg->getReg(); 1144 unsigned BaseSubReg = AddrReg->getSubReg(); 1145 unsigned BaseRegFlags = 0; 1146 if (CI.BaseOff) { 1147 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1148 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1149 .addImm(CI.BaseOff); 1150 1151 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1152 BaseRegFlags = RegState::Kill; 1153 1154 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1155 .addReg(ImmReg) 1156 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1157 .addImm(0); // clamp bit 1158 BaseSubReg = 0; 1159 } 1160 1161 MachineInstrBuilder Write2 = 1162 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1163 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1164 .add(*Data0) // data0 1165 .add(*Data1) // data1 1166 .addImm(NewOffset0) // offset0 1167 .addImm(NewOffset1) // offset1 1168 .addImm(0) // gds 1169 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1170 1171 moveInstsAfter(Write2, InstsToMove); 1172 1173 CI.I->eraseFromParent(); 1174 Paired.I->eraseFromParent(); 1175 1176 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1177 return Write2; 1178 } 1179 1180 MachineBasicBlock::iterator 1181 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1182 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1183 MachineBasicBlock *MBB = CI.I->getParent(); 1184 DebugLoc DL = CI.I->getDebugLoc(); 1185 const unsigned Opcode = getNewOpcode(CI, Paired); 1186 1187 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1188 1189 Register DestReg = MRI->createVirtualRegister(SuperRC); 1190 unsigned MergedDMask = CI.DMask | Paired.DMask; 1191 unsigned DMaskIdx = 1192 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1193 1194 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1195 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1196 if (I == DMaskIdx) 1197 MIB.addImm(MergedDMask); 1198 else 1199 MIB.add((*CI.I).getOperand(I)); 1200 } 1201 1202 // It shouldn't be possible to get this far if the two instructions 1203 // don't have a single memoperand, because 
MachineInstr::mayAlias() 1204 // will return true if this is the case. 1205 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1206 1207 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1208 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1209 1210 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1211 1212 unsigned SubRegIdx0, SubRegIdx1; 1213 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1214 1215 // Copy to the old destination registers. 1216 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1217 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1218 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1219 1220 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1221 .add(*Dest0) // Copy to same destination including flags and sub reg. 1222 .addReg(DestReg, 0, SubRegIdx0); 1223 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1224 .add(*Dest1) 1225 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1226 1227 moveInstsAfter(Copy1, InstsToMove); 1228 1229 CI.I->eraseFromParent(); 1230 Paired.I->eraseFromParent(); 1231 return New; 1232 } 1233 1234 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1235 CombineInfo &CI, CombineInfo &Paired, 1236 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1237 MachineBasicBlock *MBB = CI.I->getParent(); 1238 DebugLoc DL = CI.I->getDebugLoc(); 1239 const unsigned Opcode = getNewOpcode(CI, Paired); 1240 1241 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1242 1243 Register DestReg = MRI->createVirtualRegister(SuperRC); 1244 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1245 1246 // It shouldn't be possible to get this far if the two instructions 1247 // don't have a single memoperand, because MachineInstr::mayAlias() 1248 // will return true if this is the case. 1249 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1250 1251 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1252 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1253 1254 MachineInstr *New = 1255 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1256 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1257 .addImm(MergedOffset) // offset 1258 .addImm(CI.CPol) // cpol 1259 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1260 1261 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1262 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1263 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1264 1265 // Copy to the old destination registers. 1266 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1267 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1268 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1269 1270 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1271 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1272 .addReg(DestReg, 0, SubRegIdx0); 1273 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1274 .add(*Dest1) 1275 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1276 1277 moveInstsAfter(Copy1, InstsToMove); 1278 1279 CI.I->eraseFromParent(); 1280 Paired.I->eraseFromParent(); 1281 return New; 1282 } 1283 1284 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1285 CombineInfo &CI, CombineInfo &Paired, 1286 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1287 MachineBasicBlock *MBB = CI.I->getParent(); 1288 DebugLoc DL = CI.I->getDebugLoc(); 1289 1290 const unsigned Opcode = getNewOpcode(CI, Paired); 1291 1292 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1293 1294 // Copy to the new source register. 1295 Register DestReg = MRI->createVirtualRegister(SuperRC); 1296 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1297 1298 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1299 1300 AddressRegs Regs = getRegs(Opcode, *TII); 1301 1302 if (Regs.VAddr) 1303 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1304 1305 // It shouldn't be possible to get this far if the two instructions 1306 // don't have a single memoperand, because MachineInstr::mayAlias() 1307 // will return true if this is the case. 1308 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1309 1310 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1311 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1312 1313 MachineInstr *New = 1314 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1315 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1316 .addImm(MergedOffset) // offset 1317 .addImm(CI.CPol) // cpol 1318 .addImm(0) // tfe 1319 .addImm(0) // swz 1320 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1321 1322 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1323 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1324 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1325 1326 // Copy to the old destination registers. 1327 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1328 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1329 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1330 1331 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1332 .add(*Dest0) // Copy to same destination including flags and sub reg. 1333 .addReg(DestReg, 0, SubRegIdx0); 1334 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1335 .add(*Dest1) 1336 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1337 1338 moveInstsAfter(Copy1, InstsToMove); 1339 1340 CI.I->eraseFromParent(); 1341 Paired.I->eraseFromParent(); 1342 return New; 1343 } 1344 1345 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1346 CombineInfo &CI, CombineInfo &Paired, 1347 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1348 MachineBasicBlock *MBB = CI.I->getParent(); 1349 DebugLoc DL = CI.I->getDebugLoc(); 1350 1351 const unsigned Opcode = getNewOpcode(CI, Paired); 1352 1353 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1354 1355 // Copy to the new source register. 
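  // (For the load merge this is really a new, wider destination register; the
  // results are split back into the original vdata destinations by the COPYs
  // emitted further down.)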
1356 Register DestReg = MRI->createVirtualRegister(SuperRC); 1357 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1358 1359 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1360 1361 AddressRegs Regs = getRegs(Opcode, *TII); 1362 1363 if (Regs.VAddr) 1364 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1365 1366 unsigned JoinedFormat = 1367 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1368 1369 // It shouldn't be possible to get this far if the two instructions 1370 // don't have a single memoperand, because MachineInstr::mayAlias() 1371 // will return true if this is the case. 1372 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1373 1374 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1375 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1376 1377 MachineInstr *New = 1378 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1379 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1380 .addImm(MergedOffset) // offset 1381 .addImm(JoinedFormat) // format 1382 .addImm(CI.CPol) // cpol 1383 .addImm(0) // tfe 1384 .addImm(0) // swz 1385 .addMemOperand( 1386 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1387 1388 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1389 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1390 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1391 1392 // Copy to the old destination registers. 1393 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1394 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1395 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1396 1397 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1398 .add(*Dest0) // Copy to same destination including flags and sub reg. 1399 .addReg(DestReg, 0, SubRegIdx0); 1400 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1401 .add(*Dest1) 1402 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1403 1404 moveInstsAfter(Copy1, InstsToMove); 1405 1406 CI.I->eraseFromParent(); 1407 Paired.I->eraseFromParent(); 1408 return New; 1409 } 1410 1411 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1412 CombineInfo &CI, CombineInfo &Paired, 1413 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1414 MachineBasicBlock *MBB = CI.I->getParent(); 1415 DebugLoc DL = CI.I->getDebugLoc(); 1416 1417 const unsigned Opcode = getNewOpcode(CI, Paired); 1418 1419 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1420 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1421 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1422 1423 // Copy to the new source register. 
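  // For the store merge, the two original vdata operands are first packed into
  // one wider register with a REG_SEQUENCE (using the sub-register indices
  // computed above), and that register becomes the data operand of the merged
  // tbuffer store.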
1424 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1425 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1426 1427 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1428 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1429 1430 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1431 .add(*Src0) 1432 .addImm(SubRegIdx0) 1433 .add(*Src1) 1434 .addImm(SubRegIdx1); 1435 1436 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1437 .addReg(SrcReg, RegState::Kill); 1438 1439 AddressRegs Regs = getRegs(Opcode, *TII); 1440 1441 if (Regs.VAddr) 1442 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1443 1444 unsigned JoinedFormat = 1445 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1446 1447 // It shouldn't be possible to get this far if the two instructions 1448 // don't have a single memoperand, because MachineInstr::mayAlias() 1449 // will return true if this is the case. 1450 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1451 1452 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1453 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1454 1455 MachineInstr *New = 1456 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1457 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1458 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1459 .addImm(JoinedFormat) // format 1460 .addImm(CI.CPol) // cpol 1461 .addImm(0) // tfe 1462 .addImm(0) // swz 1463 .addMemOperand( 1464 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1465 1466 moveInstsAfter(MIB, InstsToMove); 1467 1468 CI.I->eraseFromParent(); 1469 Paired.I->eraseFromParent(); 1470 return New; 1471 } 1472 1473 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1474 const CombineInfo &Paired) { 1475 const unsigned Width = CI.Width + Paired.Width; 1476 1477 switch (CI.InstClass) { 1478 default: 1479 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1480 // FIXME: Handle d16 correctly 1481 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1482 Width); 1483 case TBUFFER_LOAD: 1484 case TBUFFER_STORE: 1485 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1486 Width); 1487 1488 case UNKNOWN: 1489 llvm_unreachable("Unknown instruction class"); 1490 case S_BUFFER_LOAD_IMM: 1491 switch (Width) { 1492 default: 1493 return 0; 1494 case 2: 1495 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1496 case 4: 1497 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1498 case 8: 1499 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1500 } 1501 case MIMG: 1502 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1503 "No overlaps"); 1504 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1505 } 1506 } 1507 1508 std::pair<unsigned, unsigned> 1509 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1510 const CombineInfo &Paired) { 1511 bool ReverseOrder; 1512 if (CI.InstClass == MIMG) { 1513 assert( 1514 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1515 "No overlaps"); 1516 ReverseOrder = CI.DMask > Paired.DMask; 1517 } else { 1518 ReverseOrder = CI.Offset > Paired.Offset; 1519 } 1520 1521 unsigned Idx0; 1522 unsigned Idx1; 1523 1524 static const unsigned Idxs[5][4] = { 1525 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1526 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 
AMDGPU::sub1_sub2_sub3_sub4}, 1527 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1528 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1529 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1530 }; 1531 1532 assert(CI.Width >= 1 && CI.Width <= 4); 1533 assert(Paired.Width >= 1 && Paired.Width <= 4); 1534 1535 if (ReverseOrder) { 1536 Idx1 = Idxs[0][Paired.Width - 1]; 1537 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1538 } else { 1539 Idx0 = Idxs[0][CI.Width - 1]; 1540 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1541 } 1542 1543 return std::make_pair(Idx0, Idx1); 1544 } 1545 1546 const TargetRegisterClass * 1547 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1548 const CombineInfo &Paired) { 1549 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1550 switch (CI.Width + Paired.Width) { 1551 default: 1552 return nullptr; 1553 case 2: 1554 return &AMDGPU::SReg_64_XEXECRegClass; 1555 case 4: 1556 return &AMDGPU::SGPR_128RegClass; 1557 case 8: 1558 return &AMDGPU::SGPR_256RegClass; 1559 case 16: 1560 return &AMDGPU::SGPR_512RegClass; 1561 } 1562 } 1563 1564 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1565 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1566 ? TRI->getAGPRClassForBitWidth(BitWidth) 1567 : TRI->getVGPRClassForBitWidth(BitWidth); 1568 } 1569 1570 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1571 CombineInfo &CI, CombineInfo &Paired, 1572 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1573 MachineBasicBlock *MBB = CI.I->getParent(); 1574 DebugLoc DL = CI.I->getDebugLoc(); 1575 1576 const unsigned Opcode = getNewOpcode(CI, Paired); 1577 1578 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1579 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1580 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1581 1582 // Copy to the new source register. 1583 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1584 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1585 1586 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1587 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1588 1589 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1590 .add(*Src0) 1591 .addImm(SubRegIdx0) 1592 .add(*Src1) 1593 .addImm(SubRegIdx1); 1594 1595 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1596 .addReg(SrcReg, RegState::Kill); 1597 1598 AddressRegs Regs = getRegs(Opcode, *TII); 1599 1600 if (Regs.VAddr) 1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1602 1603 1604 // It shouldn't be possible to get this far if the two instructions 1605 // don't have a single memoperand, because MachineInstr::mayAlias() 1606 // will return true if this is the case. 
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
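// Illustrative sketch of the emitted sequence (operand names are made up and
// the operand order is approximate): for Addr.Offset == 0x1000 with the base
// in %base.sub0/%base.sub1, this produces roughly
//   %off:sreg_32 = S_MOV_B32 4096                       ; via createRegOrImm()
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base.sub0, %off, 0
//   %hi:vgpr_32 = V_ADDC_U32_e64 %base.sub1, 0, %carry, 0
//   %new:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// and returns %new; the caller then folds the remaining delta into the
// instruction's immediate offset via updateBaseAndOffset().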
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32bit base registers and subregisters
// - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the maximum distance can presumably accommodate
  // more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
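
// A small illustration of the grouping performed by collectMergeableInsts()
// below (registers and offsets are hypothetical): given
//   %a = S_BUFFER_LOAD_DWORD_IMM %rsrc, 0
//   %b = S_BUFFER_LOAD_DWORD_IMM %rsrc, 4
//   %c = S_BUFFER_LOAD_DWORD_IMM %rsrc2, 0
// the first pass builds two lists, {%a, %b} (same class and base %rsrc) and
// {%c}; Part 2 then discards {%c}, since a single instruction has nothing to
// merge with.
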
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look for separate merges after this barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset, and then for each CombineInfo object in
  // the list try to find an instruction that can be merged with it. If one is
  // found, it is stored in the Paired field. If no instructions are found,
  // the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
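//
// As an illustration (offsets invented), four mergeable s_buffer_load_dword
// uses of the same descriptor at offsets 0, 4, 8 and 12 are typically merged
// pairwise into two s_buffer_load_dwordx2 on the first pass; because the
// combined width is still below the maximum, OptimizeListAgain/OptimizeAgain
// request another pass, which can fuse them into one s_buffer_load_dwordx4.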
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}